diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh index 36f8c6473..ad6ca0f13 100644 --- a/examples/6_evaluate_libero.sh +++ b/examples/6_evaluate_libero.sh @@ -1,14 +1,45 @@ #!/bin/bash -unset LEROBOT_HOME -unset HF_LEROBOT_HOME +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export CUDA_VISIBLE_DEVICES=3 + # CONFIGURATION -POLICY_PATH="bicmol/smolvla-libero" +POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" +POLICY_PATH="/raid/jade/models/smolvlamust" TASK=libero_spatial ENV_TYPE="libero" -BATCH_SIZE=1 -N_EPISODES=1 +BATCH_SIZE=10 +N_EPISODES=10 +# storage / caches +RAID=/raid/jade +N_ACTION_STEPS=1 +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false export MUJOCO_GL=egl +export MUJOCO_GL=egl +unset HF_HUB_OFFLINE # RUN EVALUATION python src/lerobot/scripts/eval.py \ --policy.path="$POLICY_PATH" \ @@ -17,3 +48,11 @@ python src/lerobot/scripts/eval.py \ --eval.n_episodes="$N_EPISODES" \ --env.multitask_eval=False \ --env.task=$TASK \ +# python examples/evaluate_libero.py \ +# --policy_path "$POLICY_PATH" \ +# --task_suite_name "$TASK" \ 
+# --num_steps_wait 10 \ +# --num_trials_per_task 10 \ +# --video_out_path "data/libero/videos" \ +# --device "cuda" \ +# --seed 7 \ No newline at end of file diff --git a/examples/6_evaluate_libero_2.sh b/examples/6_evaluate_libero_2.sh new file mode 100644 index 000000000..9d05c0330 --- /dev/null +++ b/examples/6_evaluate_libero_2.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export CUDA_VISIBLE_DEVICES=3 + +# CONFIGURATION +POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" +POLICY_PATH="AustineJohnBreaker/smolvla_stratch_libero_spatial" +TASK=libero_spatial +ENV_TYPE="libero" +BATCH_SIZE=10 +N_EPISODES=10 +USE_AMP=false +N_ACTION_STEPS=1 +SELF_ATTN_EVERY_N_LAYERS=2 +VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct +PAD_LANG_TO=longest +LOAD_VLM_WEIGHTS=true +NUM_VLM_LAYERS=16 +CHUNK_SIZE=50 +N_OBS_STEPS=1 +NUM_EXPERT_LAYERS=0 +EXPERT_WIDTH_MULTIPLIER=0.5 + + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export MUJOCO_GL=egl +ADD_IMAGE_TOKENS=true +unset 
HF_HUB_OFFLINE +# RUN EVALUATION +python src/lerobot/scripts/eval.py \ + --policy.path="$POLICY_PATH" \ + --env.type="$ENV_TYPE" \ + --eval.batch_size="$BATCH_SIZE" \ + --eval.n_episodes="$N_EPISODES" \ + --env.multitask_eval=False \ + --env.task=$TASK \ + --policy.use_amp=$USE_AMP \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.attention_mode=$ATTN_MODE \ + --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --policy.vlm_model_name=$VLM_NAME \ + --policy.pad_language_to=$PAD_LANG_TO \ + --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ + --policy.num_vlm_layers=$NUM_VLM_LAYERS \ + --policy.chunk_size=$CHUNK_SIZE \ + --policy.n_obs_steps=$N_OBS_STEPS \ + --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ + --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ + # --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ diff --git a/examples/7_train_acc.sh b/examples/7_train_acc.sh new file mode 100644 index 000000000..27f445143 --- /dev/null +++ b/examples/7_train_acc.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# smolvla training with accelerate + +set -euo pipefail + +# repo/env +cd ~/lerobot || exit 1 +# conda activate lerobot +export LC_ALL=C + +rm -f core-* + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl + +# CONFIG +ENV=libero +TASK=libero_spatial +REPO_ID=physical-intelligence/libero + +POLICY=smolvla +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct + +# Optim / scheduling +LR=1e-4 +DECAY_LR=2.5e-6 +DECAY_STEPS=30000 +USE_AMP=true # set to true for mixed precision +TRAIN_EXPERT_ONLY=true +N_ACTION_STEPS=1 +SEED=1000 + 
+# Training loop +OFFLINE_STEPS=100000 +BATCH_SIZE=32 +EVAL_FREQ=0 +SAVE_FREQ=20000 +EVAL_BATCH_SIZE=1 +NUM_EPISODES=1 + +# number of gpus to use +NUM_PROCESSES=2 +export CUDA_VISIBLE_DEVICES=1,3 +PORT=29522 + +# naming/output dir +TRAIN_DIR=$RAID/logs/lerobot/lerobot_2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +echo "Training dir: $TRAIN_DIR" + +rm -rf "$TRAIN_DIR" + +# RUN +python -m accelerate.commands.launch \ + --num_processes $NUM_PROCESSES \ + --num_machines 1 \ + --main_process_port $PORT \ + --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ + src/lerobot/scripts/train_accelerate.py \ + --policy.type=$POLICY \ + --policy.use_amp=True \ + --policy.vlm_model_name=$VLM \ + --dataset.repo_id=$REPO_ID \ + --dataset.root=$HF_DATASETS_CACHE \ + --env.type=$ENV \ + --env.task=$TASK \ + --output_dir=$TRAIN_DIR \ + --batch_size=$BATCH_SIZE \ + --steps=$OFFLINE_STEPS \ + --eval_freq=$EVAL_FREQ \ + --save_freq=$SAVE_FREQ \ + --eval.batch_size=$EVAL_BATCH_SIZE \ + --eval.n_episodes=$NUM_EPISODES \ + --policy.optimizer_lr=$LR \ + --policy.repo_id=None \ + --policy.scheduler_decay_lr=$DECAY_LR \ + --policy.scheduler_decay_steps=$DECAY_STEPS \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ + --policy.vlm_model_name=$VLM \ + --seed=$SEED \ + --wandb.enable=false diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh index 3943e3c96..f0b9de4e5 100644 --- a/examples/7_train_libero_smolvla.sh +++ b/examples/7_train_libero_smolvla.sh @@ -21,8 +21,8 @@ export WANDB_CACHE_DIR=$RAID/.cache/wandb export TMPDIR=$RAID/.cache/tmp mkdir -p $TMPDIR export WANDB_MODE=offline -export HF_DATASETS_OFFLINE=1 -export HF_HUB_OFFLINE=1 +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 export TOKENIZERS_PARALLELISM=false export MUJOCO_GL=egl @@ -31,11 +31,11 @@ PORT=29522 # =================== CONFIG =================== ENV=libero -TASK=libero_spatial 
+TASK=libero_object REPO_ID=physical-intelligence/libero - +ROOT=$RAID POLICY=smolvla -VLM=HuggingFaceTB/SmolVLM2-2.2B-Instruct +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct # Optim / scheduling LR=1e-4 @@ -55,10 +55,10 @@ EVAL_BATCH_SIZE=1 NUM_EPISODES=1 # GPU selection 0, 1, 2, 3 -export CUDA_VISIBLE_DEVICES=1 +export CUDA_VISIBLE_DEVICES=0 # naming/output dir -TRAIN_DIR=$RAID/logs/lerobot/lerobot_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +TRAIN_DIR=$RAID/logs/lerobot/lerobot_solo_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} echo "Training dir: $TRAIN_DIR" # train @@ -68,7 +68,6 @@ python src/lerobot/scripts/train.py \ --policy.type=$POLICY \ --policy.vlm_model_name=$VLM \ --dataset.repo_id=$REPO_ID \ - --dataset.root=$HF_DATASETS_CACHE \ --env.type=$ENV \ --env.task=$TASK \ --output_dir=$TRAIN_DIR \ @@ -85,6 +84,6 @@ python src/lerobot/scripts/train.py \ --policy.scheduler_decay_steps=$DECAY_STEPS \ --policy.n_action_steps=$N_ACTION_STEPS \ --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ - --policy.vlm_model_name=/raid/jade/.cache/huggingface/models/SmolVLM2-2.2B-Instruct \ + --policy.vlm_model_name=$VLM \ --seed=$SEED \ --wandb.enable=false diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh new file mode 100644 index 000000000..98029f3ad --- /dev/null +++ b/examples/8_train_smolvla_must.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# smolvla training with accelerate + +set -euo pipefail + +# repo/env +cd ~/lerobot || exit 1 +# conda activate lerobot +export LC_ALL=C + +rm -f core-* + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# 
export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl + +# CONFIG +ENV=libero +TASK=libero_spatial +REPO_ID=HuggingFaceVLA/libero + +POLICY=smolvla +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct + +# Optim / scheduling +LR=1e-4 +DECAY_LR=2.5e-6 +DECAY_STEPS=30000 +USE_AMP=true # set to true for mixed precision +TRAIN_EXPERT_ONLY=true +N_ACTION_STEPS=1 +SEED=1000 +LOAD_VLM_WEIGHTS=true +# Training loop +OFFLINE_STEPS=100000 +BATCH_SIZE=32 +EVAL_FREQ=0 +SAVE_FREQ=20000 +EVAL_BATCH_SIZE=1 +NUM_EPISODES=1 +ADD_IMAGE_TOKENS=true +N_OBS_STEPS=1 +ATTN_MODE=cross_attn +EXPERT_WIDTH_MULTIPLIER=0.5 +# number of gpus to use +NUM_PROCESSES=2 +NUM_VLM_LAYERS=0 +SELF_ATTN_EVERY_N_LAYERS=2 +CHUNK_SIZE=50 +export CUDA_VISIBLE_DEVICES=0 +PORT=29522 +PREFIX_LENGTH=0 +LOAD_VLM_WEIGHTS=true +# naming/output dir +TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +echo "Training dir: $TRAIN_DIR" + +rm -rf "$TRAIN_DIR" + +# RUN +# python -m accelerate.commands.launch \ +# --num_processes $NUM_PROCESSES \ +# --num_machines 1 \ +# --main_process_port $PORT \ +# --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ +# src/lerobot/scripts/train_accelerate.py \ +# --policy.type=$POLICY \ +# --policy.use_amp=True \ +# --policy.vlm_model_name=$VLM \ +# --dataset.repo_id=$REPO_ID \ +# --dataset.root=$HF_DATASETS_CACHE \ +# --env.type=$ENV \ +# --env.task=$TASK \ +# --output_dir=$TRAIN_DIR \ +# --batch_size=$BATCH_SIZE \ +# --steps=$OFFLINE_STEPS \ +# --eval_freq=$EVAL_FREQ \ +# --save_freq=$SAVE_FREQ \ +# --eval.batch_size=$EVAL_BATCH_SIZE \ +# --eval.n_episodes=$NUM_EPISODES \ +# --policy.optimizer_lr=$LR \ +# --policy.repo_id=None \ +# --policy.scheduler_decay_lr=$DECAY_LR \ +# --policy.scheduler_decay_steps=$DECAY_STEPS \ +# --policy.n_action_steps=$N_ACTION_STEPS \ +# --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ +# --policy.vlm_model_name=$VLM \ +# --policy.n_obs_steps=$N_OBS_STEPS \
+# --policy.attention_mode=$ATTN_MODE \ +# --policy.prefix_length=$PREFIX_LENGTH \ +# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ +# --policy.chunk_size=$CHUNK_SIZE \ +# --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ +# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ +# --seed=$SEED \ +# --wandb.enable=false + + +python src/lerobot/scripts/train.py \ + --policy.type=$POLICY \ + --policy.use_amp=False \ + --policy.vlm_model_name=$VLM \ + --dataset.repo_id=$REPO_ID \ + --dataset.root='/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data' \ + --env.type=$ENV \ + --env.task=$TASK \ + --output_dir=$TRAIN_DIR \ + --batch_size=$BATCH_SIZE \ + --steps=$OFFLINE_STEPS \ + --eval_freq=$EVAL_FREQ \ + --save_freq=$SAVE_FREQ \ + --eval.batch_size=$EVAL_BATCH_SIZE \ + --eval.n_episodes=$NUM_EPISODES \ + --policy.optimizer_lr=$LR \ + --policy.repo_id=None \ + --policy.scheduler_decay_lr=$DECAY_LR \ + --policy.scheduler_decay_steps=$DECAY_STEPS \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ + --policy.vlm_model_name=$VLM \ + --policy.n_obs_steps=$N_OBS_STEPS \ + --policy.attention_mode=$ATTN_MODE \ + --policy.prefix_length=$PREFIX_LENGTH \ + --policy.num_vlm_layers=$NUM_VLM_LAYERS \ + --policy.chunk_size=$CHUNK_SIZE \ + --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ + --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ + --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --seed=$SEED \ + --wandb.enable=false diff --git a/examples/checker.py b/examples/checker.py new file mode 100644 index 000000000..12377e9b9 --- /dev/null +++ b/examples/checker.py @@ -0,0 +1,27 @@ +from huggingface_hub import HfApi +api = HfApi() +# api.upload_large_folder( +# repo_id="HuggingFaceVLA/libero", +# repo_type="dataset", +# folder_path="/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero", +# ) +api.upload_large_folder( + repo_id="HuggingFaceVLA/metaworld_mt50", + repo_type="dataset", + 
folder_path="/raid/jade/.cache/huggingface/lerobot/metaworld_mt50", +) +# repo_id="HuggingFaceVLA/libero" +# # Upload extra files +# api.upload_file( +# repo_id=repo_id, +# repo_type="dataset", +# path_or_fileobj="/raid/jade/libero_converted/README.md", +# path_in_repo="README.md" +# ) + +# api.upload_folder( +# repo_id=repo_id, +# repo_type="dataset", +# folder_path="/raid/jade/libero_converted/meta", +# path_in_repo="meta" +# ) diff --git a/examples/checker2.py b/examples/checker2.py new file mode 100644 index 000000000..a5825d87f --- /dev/null +++ b/examples/checker2.py @@ -0,0 +1,35 @@ +import pyarrow.parquet as pq + +# # First parquet (cached HF version) +meta1 = pq.read_metadata("/raid/jade/.cache/huggingface/datasets/data/chunk-000/episode_000000.parquet") +meta1 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") +print("First parquet key_value_metadata:") +print(meta1.metadata) # low-level file metadata +# print() +print("Second") +# Second parquet (your converted version) +meta2 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") +print("\nSecond parquet key_value_metadata:") +# print(meta2.metadata) + +# from datasets import load_dataset +# root_dir = "/raid/jade/libero_converted" + +# # Load all parquet files under the root_dir recursively +# ds = load_dataset("parquet", data_files=f"{root_dir}/**/*.parquet") + +# print(ds) # prints split info +# print(ds["train"].features) # check schema/features + +# # Peek at one row +# example = ds["train"][0] +# print(example.keys()) +# print(type(example["observation.images.image"])) +# print(type(example["observation.images.image2"])) + +import pyarrow.parquet as pq + +for ep in ["episode_000019.parquet", "episode_000021.parquet", "episode_000026.parquet"]: + path = f"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/{ep}" + schema = 
pq.read_schema(path) + print(ep, schema.names) diff --git a/examples/convert_data.py b/examples/convert_data.py new file mode 100644 index 000000000..96ce58cb1 --- /dev/null +++ b/examples/convert_data.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +""" +Convert local LeRobot datasets from v2.0 to v2.1 format. +This script adapts the official converter to work with local datasets. +""" + +import sys +import argparse +import logging +from pathlib import Path + +# Add lerobot to path +sys.path.insert(0, '/home/jade_choghari/lerobot/src') + +from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset +from lerobot.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info +from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def convert_local_dataset( + dataset_path: str, + num_workers: int = 4, + skip_if_converted: bool = True +): + """ + Convert a local dataset from v2.0 to v2.1 format. 
+ + Args: + dataset_path: Path to the local dataset directory + num_workers: Number of workers for parallel processing + skip_if_converted: Skip if already has episodes_stats.jsonl + """ + dataset_path = Path(dataset_path) + + print(f"šŸ”„ Converting local dataset: {dataset_path}") + + # Check if already converted + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + if episodes_stats_path.exists() and skip_if_converted: + # Check if file is empty + file_size = episodes_stats_path.stat().st_size + if file_size == 0: + print(f" āš ļø episodes_stats.jsonl is empty, will regenerate") + else: + # Check if file has content + with open(episodes_stats_path, 'r') as f: + content = f.read().strip() + if not content: + print(f" āš ļø episodes_stats.jsonl has no content, will regenerate") + else: + print(f" ā­ļø Already has episodes_stats.jsonl, skipping") + return True + + try: + # Check if this is a v2.0 dataset that needs conversion + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + stats_path = dataset_path / "meta" / "stats.json" + + if not episodes_stats_path.exists() and stats_path.exists(): + print(f" šŸ”„ Detected v2.0 dataset, creating temporary episodes_stats.jsonl...") + # Create empty episodes_stats.jsonl to allow loading + episodes_stats_path.touch() + created_temp_file = True + else: + created_temp_file = False + + # Load dataset from local path with pyav video backend + print(f" šŸ“‚ Loading dataset from local path...") + # Use a dummy repo_id since we're loading locally + dummy_repo_id = f"{dataset_path.parent.name}/{dataset_path.name}" + dataset = LeRobotDataset( + dummy_repo_id, + root=str(dataset_path), + # video_backend="pyav", + # local_files_only=True + ) + + # Remove temporary file if we created it + if created_temp_file and episodes_stats_path.exists() and episodes_stats_path.stat().st_size == 0: + episodes_stats_path.unlink() + print(f" šŸ—‘ļø Removed temporary episodes_stats.jsonl") + + # Remove existing 
episodes_stats if present (ensure clean conversion) + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + if episodes_stats_path.exists(): + episodes_stats_path.unlink() + print(f" šŸ—‘ļø Removed existing episodes_stats.jsonl") + + # Check if video directory exists before conversion + videos_dir = dataset_path / "videos" + if not videos_dir.exists(): + print(f" āš ļø No videos directory found - will skip video statistics") + + # Convert stats + print(f" šŸ“Š Computing episode statistics...") + convert_stats(dataset, num_workers=num_workers) + + # Load reference stats for validation if they exist + stats_path = dataset.root / STATS_PATH + if stats_path.exists(): + print(f" āœ… Validating against reference stats...") + try: + ref_stats = load_stats(dataset.root) + check_aggregate_stats(dataset, ref_stats) + print(f" āœ… Stats validation passed!") + except AssertionError as e: + print(f" āš ļø Stats validation failed with minor differences: {e}") + print(f" āš ļø This is likely due to floating-point precision, continuing anyway...") + # Check if the error is just a small numerical difference + if "Max absolute difference:" in str(e) and "Max relative difference:" in str(e): + print(f" āœ… Treating as acceptable numerical precision difference") + else: + raise e + + # Remove old stats.json file + print(f" šŸ—‘ļø Removing old stats.json") + stats_path.unlink() + else: + print(f" āš ļø No reference stats found, skipping validation") + + # Update codebase version + dataset.meta.info["codebase_version"] = CODEBASE_VERSION + write_info(dataset.meta.info, dataset.root) + + print(f" āœ… Successfully converted to v2.1") + return True + + except Exception as e: + print(f" āŒ Failed to convert: {e}") + logger.exception("Conversion failed") + return False + +def convert_multiple_datasets( + base_dirs: list[str], + max_datasets: int = None, + num_workers: int = 4 +): + """Convert multiple datasets from base directories.""" + + datasets_to_convert = [] + + 
# Scan for datasets needing conversion + for base_dir in base_dirs: + base_path = Path(base_dir) + if not base_path.exists(): + print(f"āš ļø Directory not found: {base_dir}") + continue + + print(f"šŸ” Scanning: {base_dir}") + + # Walk through author/dataset structure + for author_dir in sorted(base_path.iterdir()): + if not author_dir.is_dir(): + continue + + for dataset_dir in sorted(author_dir.iterdir()): + if not dataset_dir.is_dir(): + continue + + # Check if needs conversion + episodes_stats_path = dataset_dir / "meta" / "episodes_stats.jsonl" + info_path = dataset_dir / "meta" / "info.json" + + needs_conversion = False + if info_path.exists(): + if not episodes_stats_path.exists(): + needs_conversion = True + print(f" šŸ“ Found (missing): {author_dir.name}/{dataset_dir.name}") + else: + # Check if episodes_stats file is empty + try: + file_size = episodes_stats_path.stat().st_size + if file_size == 0: + needs_conversion = True + print(f" šŸ“ Found (empty): {author_dir.name}/{dataset_dir.name}") + else: + # Check if file has content + with open(episodes_stats_path, 'r') as f: + content = f.read().strip() + if not content: + needs_conversion = True + print(f" šŸ“ Found (no content): {author_dir.name}/{dataset_dir.name}") + except Exception as e: + # If we can't read the file, consider it needs conversion + needs_conversion = True + print(f" šŸ“ Found (read error): {author_dir.name}/{dataset_dir.name}") + + if needs_conversion: + datasets_to_convert.append(dataset_dir) + + if not datasets_to_convert: + print("šŸŽ‰ No datasets need conversion!") + return + + if max_datasets: + datasets_to_convert = datasets_to_convert[:max_datasets] + + print(f"\nšŸš€ Converting {len(datasets_to_convert)} datasets...") + + successful = 0 + failed = 0 + + for i, dataset_path in enumerate(datasets_to_convert, 1): + print(f"\n[{i}/{len(datasets_to_convert)}] {dataset_path.parent.name}/{dataset_path.name}") + + success = convert_local_dataset(dataset_path, 
num_workers=num_workers) + if success: + successful += 1 + else: + failed += 1 + + print(f"\nšŸ“Š Conversion Summary:") + print(f" āœ… Successful: {successful}") + print(f" āŒ Failed: {failed}") + print(f" šŸ“ˆ Success rate: {successful}/{len(datasets_to_convert)} ({100*successful/len(datasets_to_convert):.1f}%)") + + +def main(): + parser = argparse.ArgumentParser(description="Convert local LeRobot datasets to v2.1 format") + parser.add_argument("--dataset", type=str, help="Single dataset path to convert") + parser.add_argument("--base-dirs", nargs="+", + default=["/fsx/dana_aubakirova/vla/community_dataset_v1"], + help="Base directories to scan for datasets") + parser.add_argument("--max-datasets", type=int, help="Maximum number of datasets to convert") + parser.add_argument("--num-workers", type=int, default=4, help="Number of workers for stats computation") + parser.add_argument("--all", action="store_true", help="Convert all datasets in base directories") + + args = parser.parse_args() + + if args.dataset: + # Convert single dataset + success = convert_local_dataset(args.dataset, num_workers=args.num_workers) + if success: + print(f"\nšŸŽ‰ Successfully converted: {args.dataset}") + else: + print(f"\nšŸ’„ Failed to convert: {args.dataset}") + sys.exit(1) + + elif args.all: + # Convert all datasets + convert_multiple_datasets( + args.base_dirs, + max_datasets=args.max_datasets, + num_workers=args.num_workers + ) + + else: + parser.print_help() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/convert_libero.py b/examples/convert_libero.py new file mode 100644 index 000000000..7bfc50eae --- /dev/null +++ b/examples/convert_libero.py @@ -0,0 +1,126 @@ +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Root directory of converted data +root_dir = "/raid/jade/libero_converted" + +# No renaming +rename_map = { + +} + +# Hugging Face features metadata (constant across all files) +HF_METADATA = { + 
b"huggingface": b'{"info": {"features": {"observation.images.image": {"_type": "Image"}, "observation.images.image2": {"_type": "Image"}, "state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, "actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, "timestamp": {"dtype": "float32", "_type": "Value"}, "frame_index": {"dtype": "int64", "_type": "Value"}, "episode_index": {"dtype": "int64", "_type": "Value"}, "index": {"dtype": "int64", "_type": "Value"}, "task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata): + try: + table = pq.read_table(parquet_path) + + # Merge metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Apply metadata to table + table = table.replace_schema_metadata(new_meta) + + # Write safely via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + shutil.move(tmp_path, parquet_path) + + print(f"āœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return False + +# Walk through all chunk dirs and patch parquet files +for dirpath, _, filenames in os.walk(root_dir): + for fname in filenames: + if fname.endswith(".parquet"): + fpath = os.path.join(dirpath, fname) + patch_parquet(fpath, HF_METADATA)#!/usr/bin/env python3 + +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Explicit list of files to patch +FILES_TO_PATCH = [ + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", + 
"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", +] + +# Optional renaming map (fill in as needed) +rename_map = { + # "old_column_name": "new_column_name", + "image": "observation.images.image", + "image2": "observation.images.image2", + "actions": "action", +} + +# Hugging Face features metadata (constant across all files) +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + # Load parquet table + table = pq.read_table(parquet_path) + + # If renaming is needed + if rename_map: + schema = table.schema + new_names = [ + rename_map.get(name, name) for name in schema.names + ] + table = table.rename_columns(new_names) + + # Merge schema metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Replace metadata in table + table = table.replace_schema_metadata(new_meta) + + # Write safely via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + + # Replace original file + shutil.move(tmp_path, parquet_path) + + print(f"āœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return 
False + + +if __name__ == "__main__": + for fpath in FILES_TO_PATCH: + if os.path.exists(fpath): + patch_parquet(fpath, HF_METADATA, rename_map) + else: + print(f"āš ļø File not found: {fpath}") diff --git a/examples/evaluate_libero.py b/examples/evaluate_libero.py new file mode 100644 index 000000000..bf99994b6 --- /dev/null +++ b/examples/evaluate_libero.py @@ -0,0 +1,255 @@ +""" +This script demonstrates how to evaluate a pretrained smolVLA policy on the LIBERO benchmark. +""" + +import collections +import dataclasses +import logging +import math +import pathlib + +import cv2 +import draccus +import imageio +import numpy as np +import torch +from libero.libero import benchmark, get_libero_path +from libero.libero.envs import OffScreenRenderEnv +from tqdm import tqdm + +from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy +from lerobot.policies.pi0.modeling_pi0 import PI0Policy + +LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0] +LIBERO_ENV_RESOLUTION = 256 # resolution used to render training data + +@dataclasses.dataclass +class Args: + """ + Evaluation arguments for smolVLA on LIBERO. + """ + + # --- Hugging Face arguments --- + policy_path: str = "lerobot/smolvla_base" + """Path to the pretrained policy on the Hugging Face Hub or local directory.""" + + # --- LIBERO environment-specific parameters --- + task_suite_name: str = "libero_spatial" + """Task suite. 
Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90""" + num_steps_wait: int = 10 + """Number of steps to wait for objects to stabilize in sim.""" + num_trials_per_task: int = 50 + """Number of rollouts per task.""" + + # --- Evaluation arguments --- + video_out_path: str = "data/libero/videos" + """Path to save videos.""" + device: str = "cuda" + """Device to use for evaluation.""" + + seed: int = 7 + """Random Seed (for reproducibility)""" + + +@draccus.wrap() +def eval_libero(args: Args) -> None: + # Set random seed + torch.manual_seed(args.seed) + np.random.seed(args.seed) + + # --- Load Policy --- + policy = SmolVLAPolicy.from_pretrained(args.policy_path) + policy.to(args.device) + policy.eval() + + # --- Initialize LIBERO task suite --- + benchmark_dict = benchmark.get_benchmark_dict() + try: + task_suite = benchmark_dict[args.task_suite_name]() + except KeyError: + raise ValueError( + f"Unknown task suite: {args.task_suite_name}. " + f"Available options are: {list(benchmark_dict.keys())}" + ) + num_tasks_in_suite = task_suite.n_tasks + logging.info(f"Task suite: {args.task_suite_name}") + + pathlib.Path(args.video_out_path).mkdir(parents=True, exist_ok=True) + + if args.task_suite_name == "libero_spatial": + max_steps = 220 # longest training demo has 193 steps + elif args.task_suite_name == "libero_object": + max_steps = 280 # longest training demo has 254 steps + elif args.task_suite_name == "libero_goal": + max_steps = 300 # longest training demo has 270 steps + elif args.task_suite_name == "libero_10": + max_steps = 520 # longest training demo has 505 steps + elif args.task_suite_name == "libero_90": + max_steps = 400 # longest training demo has 373 steps + else: + # Fallback for custom task suites + max_steps = 520 + + # --- Evaluation Loop --- + total_episodes, total_successes = 0, 0 + for task_id in tqdm(range(num_tasks_in_suite), desc="Tasks"): + # Get task + task = task_suite.get_task(task_id) + + # Get default LIBERO initial 
states + initial_states = task_suite.get_task_init_states(task_id) + + # Initialize LIBERO environment and task description + env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args.seed) + + # Start episodes + task_episodes, task_successes = 0, 0 + for episode_idx in tqdm( + range(min(args.num_trials_per_task, len(initial_states))), + desc=f"Task {task_id}: {task.language}", + leave=False, + ): + logging.info(f"\nTask: {task_description}") + + # Reset environment and policy + env.reset() + policy.reset() + + # Set initial states + obs = env.set_init_state(initial_states[episode_idx]) + + # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects + # and we need to wait for them to fall + for _ in range(args.num_steps_wait): + obs, _, _, _ = env.step(LIBERO_DUMMY_ACTION) + + # Setup + t = 0 + frames = [] + done = False + + # Add initial frame + agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) + # frames.append(agentview_image) + # import ipdb; ipdb.set_trace() + logging.info(f"Starting episode {task_episodes+1}...") + while t < max_steps: + try: + # Get preprocessed image + # IMPORTANT: rotate 180 degrees to match train preprocessing + wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1]) + agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) + frames.append(agentview_image) + + # Prepare observations dict + state = np.concatenate( + ( + obs["robot0_eef_pos"], + _quat2axisangle(obs["robot0_eef_quat"]), + obs["robot0_gripper_qpos"], + ) + ) + observation = { + "observation.images.image": torch.from_numpy(agentview_image / 255.0) + .permute(2, 0, 1) + .to(torch.float32) + .to(args.device).unsqueeze(0), + "observation.images.image2": torch.from_numpy(wrist_img / 255.0) + .permute(2, 0, 1) + .to(torch.float32) + .to(args.device).unsqueeze(0), + "observation.state": torch.from_numpy(state).to(torch.float32).to(args.device).unsqueeze(0), + "task": 
task_description, + } + + # Query model to get action + with torch.inference_mode(): + action_tensor = policy.select_action(observation) + action = action_tensor.cpu().numpy()[0] + action[-1] = 1 - action[-1] + + # Execute action in environment + obs, _, done, _ = env.step(action) + if done: + task_successes += 1 + total_successes += 1 + break + t += 1 + + except Exception as e: + logging.error(f"Caught exception: {e}") + break + + task_episodes += 1 + total_episodes += 1 + + # Save a replay video of the episode + suffix = "success" if done else "failure" + task_segment = task_description.replace(" ", "_").replace("/", "_") + video_path = ( + pathlib.Path(args.video_out_path) / f"rollout_task_{task_id}_episode_{episode_idx}_{task_segment}_{suffix}.mp4" + ) + fps = 30 + writer = imageio.get_writer(video_path, fps=fps) + + for image in frames: + writer.append_data(image) + writer.close() + logging.info(f"Saved video to {video_path}") + + # Log current results + logging.info(f"Success: {done}") + if total_episodes > 0: + logging.info(f"# episodes completed so far: {total_episodes}") + logging.info(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)") + + # Log final results for the task + if task_episodes > 0: + logging.info(f"Task {task_id} success rate: {float(task_successes) / float(task_episodes):.2f}") + if total_episodes > 0: + logging.info(f"Cumulative success rate: {float(total_successes) / float(total_episodes):.2f}") + + logging.info("--- Evaluation finished ---") + if total_episodes > 0: + logging.info(f"Total success rate: {float(total_successes) / float(total_episodes):.2f}") + logging.info(f"Total episodes: {total_episodes}") + logging.info(f"Total successes: {total_successes}") + cv2.destroyAllWindows() + + +def _get_libero_env(task, resolution, seed): + """Initializes and returns the LIBERO environment, along with the task description.""" + task_description = task.language + task_bddl_file = 
pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file + env_args = { + "bddl_file_name": str(task_bddl_file), + "camera_heights": resolution, + "camera_widths": resolution, + } + env = OffScreenRenderEnv(**env_args) + env.seed(seed) # IMPORTANT: seed seems to affect object positions even when using fixed initial state + return env, task_description + + +def _quat2axisangle(quat): + """ + Copied from robosuite: + https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 + """ + # clip quaternion + if quat[3] > 1.0: + quat[3] = 1.0 + elif quat[3] < -1.0: + quat[3] = -1.0 + + den = np.sqrt(1.0 - quat[3] * quat[3]) + if math.isclose(den, 0.0): + # This is (close to) a zero degree rotation, immediately return + return np.zeros(3) + + return (quat[:3] * 2.0 * math.acos(quat[3])) / den + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + eval_libero() \ No newline at end of file diff --git a/examples/requirements.in b/examples/requirements.in new file mode 100644 index 000000000..25664608a --- /dev/null +++ b/examples/requirements.in @@ -0,0 +1,8 @@ +imageio[ffmpeg] +numpy==1.22.4 +tqdm +tyro +PyYaml +opencv-python==4.6.0.66 +robosuite==1.4.1 +matplotlib==3.5.3 \ No newline at end of file diff --git a/examples/script2.py b/examples/script2.py new file mode 100644 index 000000000..cbd4da913 --- /dev/null +++ b/examples/script2.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +FILES_TO_PATCH = [ + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", + 
"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", +] + +# Column renaming map +rename_map = { + "wrist_image": "observation.images.image2", + "actions": "action", +} + +# Hugging Face metadata +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + table = pq.read_table(parquet_path) + + # Apply column renames if needed + if rename_map: + schema = table.schema + new_names = [rename_map.get(name, name) for name in schema.names] + table = table.rename_columns(new_names) + + # Merge schema metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Replace metadata + table = table.replace_schema_metadata(new_meta) + + # Write via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + + shutil.move(tmp_path, parquet_path) + print(f"āœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return False + + +if __name__ == "__main__": + for fpath in FILES_TO_PATCH: + if os.path.exists(fpath): + patch_parquet(fpath, HF_METADATA, rename_map) + else: + print(f"āš ļø File not found: {fpath}") 
diff --git a/examples/script3.py b/examples/script3.py new file mode 100644 index 000000000..7b4d7957a --- /dev/null +++ b/examples/script3.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Root directory containing all parquet files +ROOT_DIR = "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data" + +# Column renaming map (normalize schema to what training expects) +rename_map = { + "state": "observation.state", +} + +# Hugging Face metadata (aligned with expected feature names) +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"observation.state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + # Read the parquet table + table = pq.read_table(parquet_path) + + # Apply renames if necessary + if rename_map: + new_names = [rename_map.get(name, name) for name in table.schema.names] + if new_names != table.schema.names: + table = table.rename_columns(new_names) + + # Update metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + table = table.replace_schema_metadata(new_meta) + + # Write to temp file then atomically move back + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + shutil.move(tmp_path, parquet_path) + + # Debug print + print(f"āœ… Patched: {parquet_path}") + 
print(" Columns:", table.schema.names) + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return False + +if __name__ == "__main__": + for dirpath, _, filenames in os.walk(ROOT_DIR): + for fname in filenames: + if fname.endswith(".parquet"): + fpath = os.path.join(dirpath, fname) + patch_parquet(fpath, HF_METADATA, rename_map) diff --git a/examples/script4.py b/examples/script4.py new file mode 100644 index 000000000..2eed60a94 --- /dev/null +++ b/examples/script4.py @@ -0,0 +1,3 @@ +from huggingface_hub import HfApi +hub_api = HfApi() +hub_api.create_tag("HuggingFaceVLA/libero", tag="v2.1", repo_type="dataset") diff --git a/log_text.txt b/log_text.txt new file mode 100644 index 000000000..6676df0eb --- /dev/null +++ b/log_text.txt @@ -0,0 +1,1765 @@ + self.vlm_with_expert = SmolVLMWithExpertModel( + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/smolvlm_with_expert.py", line 88, in __init__ + self.processor = AutoProcessor.from_pretrained(model_id) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/models/auto/processing +_auto.py", line 300, in from_pretrained + config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/processing_utils.py", +line 944, in get_processor_dict + resolved_raw_chat_template_file = cached_file( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 32 +1, in cached_file + file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 47 +8, in cached_files + hf_hub_download( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p +y", line 114, in _inner_fn + return 
fn(*args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1010, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1073, in _hf_hub_download_to_cache_dir + (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_err +or( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1546, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p +y", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1463, in get_hf_file_metadata + r = _request_wrapper( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 286, in _request_wrapper + response = _request_wrapper( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 309, in _request_wrapper + response = http_backoff(method=method, url=url, **params, retry_on_exceptions=(), retry_on_status_codes=(429,)) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li +ne 310, in http_backoff + response = session.request(method=method, url=url, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 589, in + request + resp = self.send(prep, **send_kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 703, in + send + r = adapter.send(request, 
**kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li +ne 96, in send + return super().send(request, *args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/adapters.py", line 644, in + send + resp = conn.urlopen( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 78 +7, in urlopen + response = self._make_request( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 53 +4, in _make_request + response = conn.getresponse() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connection.py", line 565, i +n getresponse + httplib_response = super().getresponse() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 1375, in getresponse + response.begin() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 318, in begin + version, status, reason = self._read_status() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 279, in _read_status + line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/socket.py", line 717, in readinto + return self._sock.recv_into(b) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1307, in recv_into + return self.read(nbytes, buffer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1163, in read + return self._sslobj.read(len, buffer) +KeyboardInterrupt +clea +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:50:52 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:50:52 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:50:52 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', 
+ 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 
'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:50:52 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:50:52 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:50:52 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:50:52 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 67057.8 +5it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5343.9 +4it/s] +INFO 2025-09-09 15:50:57 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 47393.2 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3797.4 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 44384.1 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6533.1 +8it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 15:51:30 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 15:51:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 15:51:30 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 15:51:30 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 15:51:30 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 15:51:30 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 15:51:30 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 15:51:30 ts/train.py:187 num_total_params=399268924 (399M) +INFO 2025-09-09 15:51:30 ts/train.py:225 Start offline training on a fixed dataset +> /home/jade_choghari/lerobot/src/lerobot/scripts/train.py(230)train() +-> train_tracker.dataloading_s = time.perf_counter() - start_time +(Pdb) batch.keys() +dict_keys(['image', 'wrist_image', 'state', 'actions', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_i +ndex', 'task']) +(Pdb) policy.config.input_features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256))} +(Pdb) quit() +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train + train_tracker.dataloading_s = time.perf_counter() - start_time + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train + train_tracker.dataloading_s = time.perf_counter() - start_time + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + 
File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise BdbQuit +bdb.BdbQuit +clear +^[[A(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:53:49 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:53:49 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:53:49 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 
360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': 
True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:53:49 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:53:49 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:53:49 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:53:49 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. 
Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 34701.4 +4it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5495.3 +7it/s] +INFO 2025-09-09 15:53:55 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 41943.0 +4it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5500.7 +3it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 2361.6 +6it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5041.2 +3it/s] +Reducing the number of VLM layers to 16 ... 
+> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() +-> assert isinstance(policy, nn.Module) +(Pdb) features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256)), 'actions': PolicyFeature(type=, + shape=(7,))} +(Pdb) ds_meta.features +{'image': {'dtype': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'wrist_image': {'dtyp +e': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'state': {'dtype': 'float32', 'shape' +: (8,), 'names': ['state']}, 'actions': {'dtype': 'float32', 'shape': (7,), 'names': ['actions']}, 'timestamp': {'dt +ype': 'float32', 'shape': (1,), 'names': None}, 'frame_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'ep +isode_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'index': {'dtype': 'int64', 'shape': (1,), 'names': +None}, 'task_index': {'dtype': 'int64', 'shape': (1,), 'names': None}} +(Pdb) quit() + +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train + policy = make_policy( + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + assert isinstance(policy, nn.Module) + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + assert isinstance(policy, nn.Module) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise 
BdbQuit +bdb.BdbQuit +clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:56:35 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:56:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:56:35 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 
'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 
'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:56:35 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:56:35 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:56:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:56:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 78132.9 +5it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 4716.0 +3it/s] +INFO 2025-09-09 15:56:40 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5259.3 +2it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3477.8 +6it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 45343.8 +3it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5551.6 +9it/s] +Reducing the number of VLM layers to 16 ... 
+> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() +-> assert isinstance(policy, nn.Module) +(Pdb) features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256)), 'state': PolicyFeature(type=, sha +pe=(8,)), 'actions': PolicyFeature(type=, shape=(7,))} +(Pdb) quit() +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train + policy = make_policy( + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + # policy = torch.compile(policy, mode="reduce-overhead") + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + # policy = torch.compile(policy, mode="reduce-overhead") + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise BdbQuit +bdb.BdbQuit +clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:58:35 ils/utils.py:48 Cuda backend detected, using cuda. 
+WARNING 2025-09-09 15:58:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:58:35 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 
'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:58:35 ts/train.py:143 Logs will be saved locally. 
+INFO 2025-09-09 15:58:35 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:58:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:58:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 27666.4 +6it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5305.7 +0it/s] +INFO 2025-09-09 15:58:41 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 44384.1 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3192.0 +1it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 44620.2 +6it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 42799.0 +2it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 15:59:13 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 15:59:13 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 15:59:13 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 15:59:13 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 15:59:13 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 15:59:13 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 15:59:13 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 15:59:13 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 15:59:13 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy + loss, output_dict = policy.forward(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 458, in forward + actions = self.prepare_action(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 580, in prepare_action + actions = pad_vector(batch[ACTION], self.config.max_action_dim) +KeyError: 'action' +Exception in thread Thread-3 (_pin_memory_loop): +Traceback (most recent call last): + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 1016, in _bootstrap_inner + self.run() + File 
"/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 953, in run + self._target(*self._args, **self._kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. +py", line 61, in _pin_memory_loop + do_one_step() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. +py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/multiprocessing/reductions.py +", line 541, in rebuild_storage_fd + fd = df.detach() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in +detach + with _resource_sharer.get_connection(self._id) as conn: + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in +get_connection + c = Client(address, authkey=process.current_process().authkey) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 508, in Clie +nt + answer_challenge(c, authkey) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 752, in answ +er_challenge + message = connection.recv_bytes(256) # reject large message + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 216, in recv +_bytes + buf = self._recv_bytes(maxlength) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 414, in _rec +v_bytes + buf = self._recv(4) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 379, in _rec +v + chunk = read(handle, 
remaining) +ConnectionResetError: [Errno 104] Connection reset by peer +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:59:53 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:59:53 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:59:53 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 
'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 
'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:59:53 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:59:53 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:59:53 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:59:53 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 72147.3 +3it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5076.7 +1it/s] +INFO 2025-09-09 15:59:58 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6096.3 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 4348.6 +8it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 46091.2 +5it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3225.1 +5it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:00:31 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:00:31 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:00:31 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:00:31 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:00:31 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:00:31 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:00:31 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:00:31 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:00:31 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy + loss, output_dict = policy.forward(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 461, in forward + losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 850, in forward + att_2d_masks = make_att_2d_masks(pad_masks, att_masks) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 226, in make_att_2d_mask +s + att_2d_masks = att_2d_masks & pad_2d_masks +RuntimeError: The size of tensor a (199) must match the size of tensor b (181) at 
non-singleton dimension 2 +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:10:03 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:10:03 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:10:03 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 
'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 
'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:10:03 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:10:03 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:10:03 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:10:03 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 54574.89it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 7567.63it/s] +INFO 2025-09-09 16:10:09 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 40721.40it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 7516.67it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3158.36it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6775.94it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:10:41 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:10:41 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:10:41 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:10:41 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:10:41 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:10:41 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:10:41 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:10:41 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:10:41 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy + grad_scaler.unscale_(optimizer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 +, in unscale_ + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 +, in _unscale_grads_ + torch._amp_foreach_non_finite_check_and_unscale_( +RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: 
/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:12:28 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:12:28 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:12:28 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, 
+ 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 
'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:12:28 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:12:28 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:12:28 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:12:28 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 87666.13it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 4223.20it/s] +INFO 2025-09-09 16:12:34 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 43690.67it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 4871.43it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6512.89it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 43018.50it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:13:06 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:13:06 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:13:06 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:13:06 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:13:06 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:13:06 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:13:06 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:13:06 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:13:06 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy + grad_scaler.unscale_(optimizer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 +, in unscale_ + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 +, in _unscale_grads_ + torch._amp_foreach_non_finite_check_and_unscale_( +RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: 
/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:13:51 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:13:51 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:13:51 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, 
+ 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': False, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 
'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:13:51 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:13:51 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:13:51 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:13:51 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 82981.28it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 4687.94it/s] +INFO 2025-09-09 16:13:57 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 21345.06it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 4226.00it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 2966.27it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6497.76it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:14:30 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:14:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:14:30 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:14:30 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:14:30 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:14:30 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:14:30 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:14:30 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:14:30 ts/train.py:225 Start offline training on a fixed dataset +INFO 2025-09-09 16:16:20 ts/train.py:255 step:200 smpl:6K ep:40 epch:0.02 loss:1.244 grdn:2.492 lr:1.0e-05 updt_s:0. +536 data_s:0.007 +INFO 2025-09-09 16:17:56 ts/train.py:255 step:400 smpl:13K ep:79 epch:0.05 loss:0.685 grdn:4.262 lr:3.0e-05 updt_s:0 +.481 data_s:0.000 +INFO 2025-09-09 16:19:33 ts/train.py:255 step:600 smpl:19K ep:119 epch:0.07 loss:0.364 grdn:4.849 lr:5.0e-05 updt_s: +0.482 data_s:0.000 +INFO 2025-09-09 16:21:10 ts/train.py:255 step:800 smpl:26K ep:158 epch:0.09 loss:0.239 grdn:4.024 lr:7.0e-05 updt_s: +0.481 data_s:0.000 +INFO 2025-09-09 16:22:46 ts/train.py:255 step:1K smpl:32K ep:198 epch:0.12 loss:0.197 grdn:3.267 lr:9.0e-05 updt_s:0 +.478 data_s:0.000 +INFO 2025-09-09 16:24:22 ts/train.py:255 step:1K smpl:38K ep:238 epch:0.14 loss:0.173 grdn:2.319 lr:1.0e-04 updt_s:0 +.481 data_s:0.000 +INFO 2025-09-09 16:25:59 ts/train.py:255 step:1K smpl:45K ep:277 epch:0.16 loss:0.153 grdn:1.741 lr:1.0e-04 updt_s:0 +.483 data_s:0.000 +INFO 2025-09-09 16:27:36 ts/train.py:255 step:2K smpl:51K ep:317 epch:0.19 loss:0.135 grdn:1.354 lr:9.9e-05 updt_s:0 +.483 data_s:0.000 +INFO 2025-09-09 16:29:14 ts/train.py:255 step:2K smpl:58K ep:357 epch:0.21 loss:0.126 grdn:1.177 lr:9.9e-05 updt_s:0 +.484 data_s:0.000 + diff --git 
a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py index f5fa727cf..75863d3fc 100644 --- a/src/lerobot/configs/policies.py +++ b/src/lerobot/configs/policies.py @@ -62,6 +62,7 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP, # automatic gradient scaling is used. use_amp: bool = False + gradient_accumulation_steps: int = 1 push_to_hub: bool = True repo_id: str | None = None diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index a869cb920..6509993bb 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -472,7 +472,6 @@ class LeRobotDataset(torch.utils.data.Dataset): episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes] self.stats = aggregate_stats(episodes_stats) - # Load actual data try: if force_cache_sync: raise FileNotFoundError @@ -598,6 +597,7 @@ class LeRobotDataset(torch.utils.data.Dataset): """hf_dataset contains all the observations, states, actions, rewards, etc.""" if self.episodes is None: path = str(self.root / "data") + # added by jade hf_dataset = load_dataset("parquet", data_dir=path, split="train") else: files = [str(self.root / self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes] diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index 078c5351d..daa1de163 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -455,7 +455,8 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea shape = (shape[2], shape[0], shape[1]) elif key == "observation.environment_state": type = FeatureType.ENV - elif key.startswith("observation"): + # changed by jade + elif key.startswith("observation") or key.startswith("state"): type = FeatureType.STATE elif key.startswith("action"): type = FeatureType.ACTION diff --git 
a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 4b8eeffd1..79461d3a9 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -34,6 +34,7 @@ from lerobot.policies.sac.reward_model.configuration_classifier import RewardCla from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config def get_policy_class(name: str) -> PreTrainedPolicy: @@ -74,6 +75,10 @@ def get_policy_class(name: str) -> PreTrainedPolicy: from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy return SmolVLAPolicy + elif name == "smolpi0": + from lerobot.policies.smolpi0.modeling_smolpi0 import SMOLPI0Policy + + return SMOLPI0Policy else: raise NotImplementedError(f"Policy with name {name} is not implemented.") @@ -97,6 +102,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig: return SmolVLAConfig(**kwargs) elif policy_type == "reward_classifier": return RewardClassifierConfig(**kwargs) + elif policy_type == "smolpi0": + return SMOLPI0Config(**kwargs) else: raise ValueError(f"Policy type '{policy_type}' is not available.") @@ -170,7 +177,6 @@ def make_policy( policy = policy_cls(**kwargs) policy.to(cfg.device) assert isinstance(policy, nn.Module) - # policy = torch.compile(policy, mode="reduce-overhead") return policy diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py index 119055873..043265b1b 100644 --- a/src/lerobot/policies/normalize.py +++ b/src/lerobot/policies/normalize.py @@ -255,6 +255,83 @@ class Unnormalize(nn.Module): return batch +class NormalizePerRobotType(nn.Module): + """Normalizes data (e.g. 
"observation.image") for more stable and faster convergence during training.""" + + def __init__( + self, + features: dict[str, PolicyFeature], + norm_map: dict[str, NormalizationMode], + stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values + are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing + mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape + is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. + modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values + are their normalization modes among: + - "mean_std": subtract the mean and divide by standard deviation. + - "min_max": map to [-1, 1] range. + stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") + and values are dictionaries of statistic types and their values (e.g. + `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for + training the model for the first time, these statistics will overwrite the default buffers. If + not provided, as expected for finetuning or evaluation, the default buffers should to be + overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the + dataset is not needed to get the stats, since they are already in the policy state_dict. + """ + super().__init__() + self.features = features + self.norm_map = norm_map + for robot_type in stats.keys(): + stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) + for key, buffer in stats_buffers.items(): + setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) + + # TODO(rcadene): should we remove torch.no_grad? 
+ @torch.no_grad + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + batch = dict(batch) # shallow copy avoids mutating the input batch + assert "robot_type" in batch, "robot_type is not in the batch" + robot_types = batch["robot_type"] + + for key, ft in self.features.items(): + if key not in batch: + continue + + norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) + if norm_mode is NormalizationMode.IDENTITY: + continue + # FIXME(mshukor): make it more efficient + buffers = [ + getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types + ] + if norm_mode is NormalizationMode.MEAN_STD: + mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))],dim=0) + std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))],dim=0) + if batch[key].ndim == 3: + mean = mean.unsqueeze(1) + std = std.unsqueeze(1) + assert not torch.isinf(mean).any(), _no_stats_error_str("mean") + assert not torch.isinf(std).any(), _no_stats_error_str("std") + batch[key] = (batch[key] - mean) / (std + 1e-8) + elif norm_mode is NormalizationMode.MIN_MAX: + min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) + max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(min).any(), _no_stats_error_str("min") + assert not torch.isinf(max).any(), _no_stats_error_str("max") + if batch[key].ndim == 3: + min = min.unsqueeze(1) + max = max.unsqueeze(1) + # normalize to [0,1] + batch[key] = (batch[key] - min) / (max - min + 1e-8) + # normalize to [-1, 1] + batch[key] = batch[key] * 2 - 1 + else: + raise ValueError(norm_mode) + return batch # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization # and remove the `Normalize` and `Unnormalize` classes. 
def _initialize_stats_buffers( @@ -418,3 +495,87 @@ class UnnormalizeBuffer(nn.Module): raise ValueError(norm_mode) return batch + + +class UnnormalizePerRobotType(nn.Module): + """ + Similar to `Normalize` but unnormalizes output data (e.g. `{"action": torch.randn(b,c)}`) in their + original range used by the environment. + """ + + def __init__( + self, + features: dict[str, PolicyFeature], + norm_map: dict[str, NormalizationMode], + stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values + are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing + mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape + is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. + modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values + are their normalization modes among: + - "mean_std": subtract the mean and divide by standard deviation. + - "min_max": map to [-1, 1] range. + stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") + and values are dictionaries of statistic types and their values (e.g. + `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for + training the model for the first time, these statistics will overwrite the default buffers. If + not provided, as expected for finetuning or evaluation, the default buffers should to be + overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the + dataset is not needed to get the stats, since they are already in the policy state_dict. 
+ """ + super().__init__() + self.features = features + self.norm_map = norm_map + self.stats = stats + # `self.buffer_observation_state["mean"]` contains `torch.tensor(state_dim)` + for robot_type in stats.keys(): + stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) + for key, buffer in stats_buffers.items(): + setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) + + # TODO(rcadene): should we remove torch.no_grad? + @torch.no_grad + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + batch = dict(batch) # shallow copy avoids mutating the input batch + assert "robot_type" in batch, "robot_type is not in the batch" + robot_types = batch["robot_type"] + + for key, ft in self.features.items(): + if key not in batch: + continue + + norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) + if norm_mode is NormalizationMode.IDENTITY: + continue + + # buffer = getattr(self, "buffer_" + key.replace(".", "_")) + buffers = [ + getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types + ] + + if norm_mode is NormalizationMode.MEAN_STD: + mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0) + std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(mean).any(), _no_stats_error_str("mean") + assert not torch.isinf(std).any(), _no_stats_error_str("std") + if batch[key].ndim == 3: + mean = mean.unsqueeze(1) + std = std.unsqueeze(1) + batch[key] = batch[key] * std + mean + elif norm_mode is NormalizationMode.MIN_MAX: + min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) + max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(min).any(), _no_stats_error_str("min") + assert not torch.isinf(max).any(), _no_stats_error_str("max") + if batch[key].ndim == 3: + min = min.unsqueeze(1) + max = max.unsqueeze(1) + batch[key] = (batch[key] 
+ 1) / 2 + batch[key] = batch[key] * (max - min) + min + else: + raise ValueError(norm_mode) + return batch diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py new file mode 100644 index 000000000..c3605cd82 --- /dev/null +++ b/src/lerobot/policies/smolpi0/configuration_smolpi0.py @@ -0,0 +1,210 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from lerobot.optim.optimizers import AdamWConfig +from lerobot.optim.schedulers import ( + CosineDecayWithWarmupSchedulerConfig, +) +from lerobot.configs.policies import PreTrainedConfig +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature + + +@dataclass +class PEFTConfig: + r: int = 4 + lora_alpha: int = 16 + lora_dropout: float = 0.1 + target_modules: str = "q_proj,v_proj" + + +@PreTrainedConfig.register_subclass("smolpi0") +@dataclass +class SMOLPI0Config(PreTrainedConfig): + # Input / output structure. 
+ n_obs_steps: int = 1 + chunk_size: int = 50 + n_action_steps: int = 50 + n_obs_gap: int = 1 + + normalization_mapping: dict[str, NormalizationMode] = field( + default_factory=lambda: { + "VISUAL": NormalizationMode.IDENTITY, + "STATE": NormalizationMode.MEAN_STD, + "ACTION": NormalizationMode.MEAN_STD, + } + ) + + # Shorter state and action vectors will be padded + max_state_dim: int = 32 + max_action_dim: int = 32 + + # Image preprocessing + resize_imgs_with_padding: tuple[int, int] = (512, 512) #(224, 224) + + # Add empty images. Used by pi0_aloha_sim which adds the empty + # left and right wrist cameras in addition to the top camera. + empty_cameras: int = 0 + + # Converts the joint and gripper values from the standard Aloha space to + # the space used by the pi internal runtime which was used to train the base model. + adapt_to_pi_aloha: bool = False + + # Converts joint dimensions to deltas with respect to the current state before passing to the model. + # Gripper dimensions will remain in absolute values. 
+ use_delta_joint_actions_aloha: bool = False + + # Tokenizer + tokenizer_max_length: int = 48 + + # Projector + proj_width: int = 480 + + # Decoding + num_steps: int = 10 + + # Attention utils + use_cache: bool = True + attention_implementation: str = "eager" # or fa2, flex + + # Finetuning settings + freeze_vision_encoder: bool = True + train_expert_only: bool = False + train_state_proj: bool = True + + # Training presets + optimizer_lr: float = 2.5e-5 + optimizer_betas: tuple[float, float] = (0.9, 0.95) + optimizer_eps: float = 1e-8 + optimizer_weight_decay: float = 1e-10 + optimizer_grad_clip_norm: float = 10 + optimizer_lr_vlm: float = 0 + + scheduler_warmup_steps: int = 1_000 + scheduler_decay_steps: int = 30_000 + scheduler_decay_lr: float = 2.5e-6 + + # TODO: Add EMA + vlm_model_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + checkpoint_path: str = None + load_vlm_weights: bool = False + + peft_method: str = "" + peft_config: PEFTConfig = PEFTConfig() + peft_target_model: str = "" + + add_image_special_tokens: bool = False + add_prompt_template: bool = False + prefix_prompt_template: str = f"<|im_start|>User: What action should the robot take to" + suffix_prompt_template: str = f"?\nAssistant:" + + attention_mode: str = "self_attn" + + prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length + + past_obs_keys: str = f"image" + + add_local_special_image_tokens: bool = False + + reverse_images_order: bool = False + + state_to_prefix: bool = False + + pad_language_to: str = "longest" # "max_length" + + num_expert_layers: int = -1 + num_vlm_layers: int = -1 + + causal_action_attention_mask: bool = False + + self_attn_every_n_layers: int = -1 + + expert_width_multiplier: float = 0.5 + + robot_type: str = "" + + self_attn_only_actions: bool = False + + causal_attention_on_history: bool = False + + predict_relative_actions: bool = False + relative_actions_mode: str = "first" + + shuffle_camera_positions: 
bool = False + vlm_img_size: int = -1 + + regression_loss: bool = False + + def __post_init__(self): + super().__post_init__() + if self.vlm_img_size > 0: + self.resize_imgs_with_padding = (self.vlm_img_size, self.vlm_img_size) + """Input validation (not exhaustive).""" + if self.n_action_steps > self.chunk_size: + raise ValueError( + f"The chunk size is the upper bound for the number of action steps per model invocation. Got " + f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`." + ) + # if self.n_obs_steps != 1: + # raise ValueError( + # f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`" + # ) + + if self.use_delta_joint_actions_aloha: + raise NotImplementedError( + "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot." + ) + + def validate_features(self) -> None: + # TODO: implement value error + # if not self.image_features and not self.env_state_feature: + # raise ValueError("You must provide at least one image or the environment state among the inputs.") + + for i in range(self.empty_cameras): + key = f"observation.images.empty_camera_{i}" + empty_camera = PolicyFeature( + type=FeatureType.VISUAL, + shape=(3, 480, 640), + ) + self.input_features[key] = empty_camera + + def get_optimizer_preset(self) -> AdamWConfig: + return AdamWConfig( + lr=self.optimizer_lr, + betas=self.optimizer_betas, + eps=self.optimizer_eps, + weight_decay=self.optimizer_weight_decay, + grad_clip_norm=self.optimizer_grad_clip_norm, + ) + + def get_scheduler_preset(self): + return CosineDecayWithWarmupSchedulerConfig( + peak_lr=self.optimizer_lr, + decay_lr=self.scheduler_decay_lr, + num_warmup_steps=self.scheduler_warmup_steps, + num_decay_steps=self.scheduler_decay_steps, + ) + + @property + def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations + return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, 
self.n_obs_gap)][::-1] + + @property + def action_delta_indices(self) -> list: + return list(range(self.chunk_size)) + + @property + def reward_delta_indices(self) -> None: + return None diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py new file mode 100644 index 000000000..13950f743 --- /dev/null +++ b/src/lerobot/policies/smolpi0/flex_attention.py @@ -0,0 +1,145 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F # noqa: N812 +from packaging.version import Version + +if Version(torch.__version__) > Version("2.5.0"): + # Ffex attention is only available from torch 2.5 onwards + from torch.nn.attention.flex_attention import ( + _mask_mod_signature, + _round_up_to_multiple, + create_block_mask, + create_mask, + flex_attention, + ) + + +@torch.compile(dynamic=False) +def flex_attention_forward( + attention_mask: torch.Tensor, + batch_size: int, + head_dim: int, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + scaling=None, + num_att_heads: int = 8, + num_key_value_heads: int = 1, +): + """ + This is defined out of classes to make compile happy. 
+ """ + + original_dtype = query_states.dtype + num_key_value_groups = num_att_heads // num_key_value_heads + key_states = key_states[:, :, :, None, :] + key_states = key_states.expand( + batch_size, key_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim + ) + key_states = key_states.reshape( + batch_size, key_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim + ) + + value_states = value_states[:, :, :, None, :] + value_states = value_states.expand( + batch_size, value_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim + ) + value_states = value_states.reshape( + batch_size, value_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # query_states = query_states.to(torch.float32) + # key_states = key_states.to(torch.float32) + # value_states = value_states.to(torch.float32) + + causal_mask = attention_mask + if causal_mask is not None: + causal_mask = causal_mask[:, None, :, : key_states.shape[2]] + + if causal_mask.shape[1] == 1 and query_states.shape[1] > 1: + causal_mask = causal_mask.expand(-1, query_states.shape[1], -1, -1) + + def precomputed_mask_factory(precomputed_mask: torch.Tensor) -> _mask_mod_signature: + def mask_mod(b, h, q_idx, kv_idx): + # Danger zone: if b,h,q_idx,kv_idx exceed the shape, device-side assert occurs. 
+ return precomputed_mask[b][h][q_idx][kv_idx] + + return mask_mod + + b_mask, h_mask, q_len, kv_len = causal_mask.shape # The shape of your mask + + block_size = 128 # limitation of flex attention + q_len_rounded = _round_up_to_multiple(q_len, block_size) + kv_len_rounded = _round_up_to_multiple(kv_len, block_size) + + # *CRITICAL* we do need to expand here, else we get a CUDA index error + + pad_q = q_len_rounded - q_len + pad_k = kv_len_rounded - kv_len + if pad_q > 0 or pad_k > 0: + padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0) + else: + padded_causal_mask = causal_mask + mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask) + + mask_4d = create_mask( + mod_fn=mask_mod_fn_orig, + B=b_mask, + H=h_mask, + Q_LEN=q_len_rounded, + KV_LEN=kv_len_rounded, + device=causal_mask.device, + ) + + mask_mod_fn_padded = precomputed_mask_factory(mask_4d) + # FIXME(mshukor): compile mask torch.compile(create_block_mask) + create_block_mask_compiled = torch.compile(create_block_mask) + block_mask = create_block_mask_compiled( + mask_mod=mask_mod_fn_padded, + B=b_mask, + H=None, # + Q_LEN=q_len_rounded, + KV_LEN=kv_len_rounded, + BLOCK_SIZE=block_size, + device=causal_mask.device, + _compile=False, + ) + padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states + padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states + padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states + # mask is applied inside the kernel, ideally more efficiently than score_mod. 
+ attn_output, attention_weights = flex_attention( + padded_query_states, + padded_key_states, + padded_value_states, + block_mask=block_mask, + enable_gqa=True, # because we shaped query/key states for GQA + scale=head_dim**-0.5 if scaling is None else scaling, + return_lse=True, + ) + + attn_output = attn_output.to(dtype=original_dtype) + attn_output = attn_output.transpose(1, 2).contiguous() # [B, Q_LEN, H, head_dim] + attn_output = attn_output.reshape( + batch_size, + -1, + attn_output.shape[2] * attn_output.shape[3], # merges [H, head_dim] + ) + return attn_output[:, :-pad_k, :] if pad_k > 0 else attn_output diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py new file mode 100644 index 000000000..fa2d3d5a7 --- /dev/null +++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py @@ -0,0 +1,1021 @@ +#!/usr/bin/env python + +# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Ļ€0: A Vision-Language-Action Flow Model for General Robot Control + +[Paper](https://www.physicalintelligence.company/download/pi0.pdf) +[Jax code](https://github.com/Physical-Intelligence/openpi) + +Designed by Physical Intelligence. Ported from Jax by Hugging Face. 
+ +Install pi0 extra dependencies: +```bash +pip install -e ".[pi0]" +``` + +Example of finetuning the pi0 pretrained model (`pi0_base` in `openpi`): +```bash +python lerobot/scripts/train.py \ +--policy.path=lerobot/pi0 \ +--dataset.repo_id=danaaubakirova/koch_test +``` + +Example of finetuning the pi0 neural network with PaliGemma and expert Gemma +pretrained with VLM default parameters before pi0 finetuning: +```bash +python lerobot/scripts/train.py \ +--policy.type=pi0 \ +--dataset.repo_id=danaaubakirova/koch_test +``` + +Example of using the pi0 pretrained model outside LeRobot training framework: +```python +policy = Pi0Policy.from_pretrained("lerobot/pi0") +``` + +""" + +import math +from collections import deque + +import torch +import torch.nn.functional as F # noqa: N812 +from torch import Tensor, nn +from transformers import AutoProcessor + +from lerobot.constants import ACTION, OBS_STATE +from lerobot.policies.normalize import ( + Normalize, + NormalizePerRobotType, + Unnormalize, + UnnormalizePerRobotType, +) +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config +from lerobot.policies.smolpi0.smolvlm_with_expert import ( + SmolVLMWithExpertModel +) +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.utils.utils import get_safe_dtype +OBS_IMAGE = "observation.image" +OBS_IMAGES = "observation.images" +ACTION = "action" +OBS_IMAGE_2 = "observation.image2" +OBS_IMAGE_3 = "observation.image3" +OBS_IMAGE_4 = "observation.image4" +TASK = "task" +ROBOT = "robot_type" +IMAGES_ORDER = { + OBS_IMAGE: 0, + OBS_IMAGE_2: 1, + OBS_IMAGE_3: 2, + OBS_IMAGE_4: 3, +} +from lerobot.policies.utils import ( + populate_queues, +) +import random +def create_sinusoidal_pos_embedding( + time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" +) -> Tensor: + """Computes sine-cosine positional embedding vectors for scalar positions.""" + if dimension % 2 != 0: + raise ValueError(f"dimension ({dimension}) must 
be divisible by 2") + + if time.ndim != 1: + raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + + dtype = get_safe_dtype(torch.float64, device.type) + fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) + period = min_period * (max_period / min_period) ** fraction + + # Compute the outer product + scaling_factor = 1.0 / period * 2 * math.pi + sin_input = scaling_factor[None, :] * time[:, None] + pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + return pos_emb + + +def sample_beta(alpha, beta, bsize, device): + gamma1 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / alpha) + gamma2 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / beta) + return gamma1 / (gamma1 + gamma2) + + +def make_att_2d_masks(pad_masks, att_masks): + """Copied from big_vision. + + Tokens can attend to valid inputs tokens which have a cumulative mask_ar + smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to + setup several types of attention, for example: + + [[1 1 1 1 1 1]]: pure causal attention. + + [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between + themselves and the last 3 tokens have a causal attention. The first + entry could also be a 1 without changing behaviour. + + [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a + block can attend all previous blocks and all tokens on the same block. + + Args: + input_mask: bool[B, N] true if its part of the input, false if padding. + mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on + it and 0 where it shares the same attention mask as the previous token. 
+ """ + if att_masks.ndim != 2: + raise ValueError(att_masks.ndim) + if pad_masks.ndim != 2: + raise ValueError(pad_masks.ndim) + + cumsum = torch.cumsum(att_masks, dim=1) + att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] + pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] + att_2d_masks = att_2d_masks & pad_2d_masks + return att_2d_masks + + +def resize_with_pad(img, width, height, pad_value=-1): + # assume no-op when width height fits already + if img.ndim != 4: + raise ValueError(f"(b,c,h,w) expected, but {img.shape}") + + cur_height, cur_width = img.shape[2:] + + ratio = max(cur_width / width, cur_height / height) + resized_height = int(cur_height / ratio) + resized_width = int(cur_width / ratio) + resized_img = F.interpolate( + img, size=(resized_height, resized_width), mode="bilinear", align_corners=False + ) + + pad_height = max(0, int(height - resized_height)) + pad_width = max(0, int(width - resized_width)) + + # pad on left and top of image + padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) + return padded_img + + +def pad_vector(vector, new_dim): + """Can be (batch_size x sequence_length x features_dimension) + or (batch_size x features_dimension) + """ + if vector.shape[-1] == new_dim: + return vector + shape = list(vector.shape) + current_dim = shape[-1] + shape[-1] = new_dim + new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) + new_vector[..., :current_dim] = vector + return new_vector + + +def normalize(x, min_val, max_val): + return (x - min_val) / (max_val - min_val) + + +def unnormalize(x, min_val, max_val): + return x * (max_val - min_val) + min_val + + +def safe_arcsin(value): + # This ensures that the input stays within + # [āˆ’1,1] to avoid invalid values for arcsin + return torch.arcsin(torch.clamp(value, -1.0, 1.0)) + + +def aloha_gripper_to_angular(value): + # Aloha transforms the gripper positions into a linear space. 
The following code + # reverses this transformation to be consistent with pi0 which is pretrained in + # angular space. + # + # These values are coming from the Aloha code: + # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED + value = unnormalize(value, min_val=0.01844, max_val=0.05800) + + # This is the inverse of the angular to linear transformation inside the Interbotix code. + def linear_to_radian(linear_position, arm_length, horn_radius): + value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) + return safe_arcsin(value) + + # The constants are taken from the Interbotix code. + value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) + + # Normalize to [0, 1]. + # The values 0.4 and 1.5 were measured on an actual Trossen robot. + return normalize(value, min_val=0.4, max_val=1.5) + + +def aloha_gripper_from_angular(value): + # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha. + # Note that the units are still angular but the range is different. + + # The values 0.4 and 1.5 were measured on an actual Trossen robot. + value = unnormalize(value, min_val=0.4, max_val=1.5) + + # These values are coming from the Aloha code: + # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE + return normalize(value, min_val=-0.6213, max_val=1.4910) + + +def aloha_gripper_from_angular_inv(value): + # Directly inverts the gripper_from_angular function. 
+ value = unnormalize(value, min_val=-0.6213, max_val=1.4910) + return normalize(value, min_val=0.4, max_val=1.5) + +class SMOLPI0Policy(PreTrainedPolicy): + """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" + + config_class = SMOLPI0Config + name = "smolpi0" + + def __init__( + self, + config: SMOLPI0Config, + dataset_stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + config: Policy configuration class instance or None, in which case the default instantiation of + the configuration class is used. + dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected + that they will be passed with a call to `load_state_dict` before the policy is used. + """ + + super().__init__(config) + config.validate_features() + self.config = config + self.normalize_per_robot_type = getattr( + config, "normalize_per_robot_type", False + ) # FIXME(mshukor): assert in case of single dataset + if self.normalize_per_robot_type: + if not dataset_stats: + dataset_stats[config.robot_type] = {} + self.normalize_inputs = NormalizePerRobotType( + config.input_features, config.normalization_mapping, dataset_stats + ) + self.normalize_targets = NormalizePerRobotType( + config.output_features, config.normalization_mapping, dataset_stats + ) + self.unnormalize_outputs = UnnormalizePerRobotType( + config.output_features, config.normalization_mapping, dataset_stats + ) + else: + self.normalize_inputs = Normalize( + config.input_features, config.normalization_mapping, dataset_stats + ) + self.normalize_targets = Normalize( + config.output_features, config.normalization_mapping, dataset_stats + ) + self.unnormalize_outputs = Unnormalize( + config.output_features, config.normalization_mapping, dataset_stats + ) + + self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer + self.model = VLAFlowMatching(config) + self.include_past_states = config.n_obs_steps > 1 and 
OBS_STATE in self.config.past_obs_keys.split(",") + self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") + self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 + self.reset() + + def reset(self): + """This should be called whenever the environment is reset.""" + # self._action_queue = deque([], maxlen=self.config.n_action_steps) + self._queues = { + ACTION: deque(maxlen=self.config.n_action_steps), + } + if self.config.n_obs_steps > 1: + for k in self.config.input_features: + if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]): + self._queues[k] = deque(maxlen=self.config.n_obs_steps) + + def get_optim_params(self) -> dict: + if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr: + params = [ + { + "params": [ + p + for n, p in self.named_parameters() + if not ".vlm." in n and p.requires_grad + ] + }, + { + "params": [ + p + for n, p in self.named_parameters() + if ".vlm." in n and p.requires_grad + ], + "lr": self.config.optimizer_lr_vlm, + }, + ] + return params + + else: + return self.parameters() + + + def merge_peft_model_weights(self) -> None: + if "lora" in self.config.peft_method: + self.model.vlm_with_expert.merge_lora_weights() + + @torch.no_grad + def select_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + """Select a single action given environment observations. + + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. 
+ """ + self.eval() + + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + + batch = self.normalize_inputs(batch) + + images, img_masks = self.prepare_images(batch) + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + + actions = self.model.sample_actions( + images, img_masks, lang_tokens, lang_masks, state, noise=noise + ) + # Unpad actions + original_action_dim = self.config.action_feature.shape[0] + actions = actions[:, :, :original_action_dim] + + actions = self.unnormalize_outputs({"action": actions, "robot_type": batch["robot_type"]})["action"] + + if self.config.adapt_to_pi_aloha: + actions = self._pi_aloha_encode_actions(actions) + + return actions + + @torch.no_grad + def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + """Select a single action given environment observations. + + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. + """ + self.eval() + + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + + batch = self.normalize_inputs(batch) + + self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by + # querying the policy. 
+ if len(self._queues[ACTION]) == 0: + for k in batch: + if k in self._queues: + batch[k] = torch.stack(list(self._queues[k]), dim=1) + images, img_masks = self.prepare_images(batch) + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + actions = self.model.sample_actions( + images, img_masks, lang_tokens, lang_masks, state, noise=noise + ) + if self.config.predict_relative_actions and actions.ndim == 3: + # If the model predicts relative actions, we need to unpad the actions + # and then convert them to absolute actions. + if self.config.relative_actions_mode == "first": + actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :1]), dim=1) + elif self.config.relative_actions_mode == "state": + actions = actions + state.unsqueeze(1) + else: + actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :-1]), dim=1) + # Unpad actions + original_action_dim = self.config.action_feature.shape[0] + actions = actions[:, :, :original_action_dim] + + actions = self.unnormalize_outputs({"action": actions})["action"] + + if self.config.adapt_to_pi_aloha: + actions = self._pi_aloha_encode_actions(actions) + + # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue + # effectively has shape (n_action_steps, batch_size, *), hence the transpose. 
+ self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) + return self._queues[ACTION].popleft() + + def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: + """Do a full training forward pass to compute the loss""" + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) + batch = self.normalize_inputs(batch) + batch = self.normalize_targets(batch) + images, img_masks = self.prepare_images( + batch + ) # FIXME(mshukor): adapte it to take into account already padded images in the batch + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + actions = self.prepare_action(batch, state=state) + actions_is_pad = batch.get("actions_id_pad") + loss_dict = {} + losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) + loss_dict["losses_after_forward"] = losses.mean().clone() + + if actions_is_pad is not None: + in_episode_bound = ~actions_is_pad + losses = losses * in_episode_bound.unsqueeze(-1) + loss_dict["losses_after_in_ep_bound"] = losses.mean().clone() + + # Remove padding + losses = losses[:, :, : self.config.max_action_dim] + loss_dict["losses_after_rm_padding"] = losses.mean().clone() + + # For backward pass + loss = losses.mean() + # For backward pass + loss_dict["loss"] = loss + # # For logging + # loss_dict["l2_loss"] = loss.item() # remove for torch compile + return loss_dict + + def prepare_images(self, batch): + """Apply Pi0 preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and + convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. 
+ """ + images = [] + img_masks = [] + present_img_keys = [key for key in self.config.image_features if key in batch] + missing_img_keys = [key for key in self.config.image_features if key not in batch] + + present_img_keys = sorted(present_img_keys, key=lambda k: IMAGES_ORDER.get(k, float("inf")), reverse=self.config.reverse_images_order) + if self.config.shuffle_camera_positions and ACTION in batch: # only during training + present_img_keys = random.sample(present_img_keys, len(present_img_keys)) + if len(present_img_keys) == 0: + raise ValueError( + f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" + ) + for i in range(self.num_past_images): + # Preprocess image features present in the batch + for key in present_img_keys: + img = batch[key][:, i, :, :, :] if batch[key].ndim == 5 else batch[key] + if self.config.resize_imgs_with_padding is not None: + img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) + + # Normalize from range [0,1] to [-1,1] as expacted by siglip + img = img * 2.0 - 1.0 + + bsize = img.shape[0] + device = img.device + if f"{key}_padding_mask" in batch: + mask = batch[f"{key}_padding_mask"].bool() + else: + mask = torch.ones(bsize, dtype=torch.bool, device=device) + images.append(img) + img_masks.append(mask) + + # Create image features not present in the batch + # as fully 0 padded images. 
+ for num_empty_cameras in range(len(missing_img_keys)): + if num_empty_cameras >= self.config.empty_cameras: + break + img = torch.ones_like(img) * -1 + mask = torch.zeros_like(mask) + images.append(img) + img_masks.append(mask) + return images, img_masks + + def prepare_language(self, batch) -> tuple[Tensor, Tensor]: + """Tokenize the text input""" + device = batch[OBS_STATE].device + tasks = batch["task"] + if len(tasks) == 1: + tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] + + if self.config.add_prompt_template: + tasks = [f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" for task in tasks] + else: + tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] + tokenized_prompt = self.language_tokenizer.__call__( + tasks, + padding=self.config.pad_language_to, + padding_side="right", + max_length=self.config.tokenizer_max_length, + return_tensors="pt", + truncation=True, # FIXME(mshukor) + ) + + lang_tokens = tokenized_prompt["input_ids"].to(device=device) + lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) + + return lang_tokens, lang_masks + + def _pi_aloha_decode_state(self, state): + # Flip the joints. + for motor_idx in [1, 2, 8, 9]: + state[:, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) + return state + + def _pi_aloha_encode_actions(self, actions): + # Flip the joints. + for motor_idx in [1, 2, 8, 9]: + actions[:, :, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) + return actions + + def _pi_aloha_encode_actions_inv(self, actions): + # Flip the joints again. 
+ for motor_idx in [1, 2, 8, 9]: + actions[:, :, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) + return actions + + def prepare_state(self, batch): + """Pad state""" + state = batch[OBS_STATE][:, -1, :] if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) else batch[OBS_STATE] # FIXME(mshukor): no state history for now + state = pad_vector(state, self.config.max_state_dim) + return state + + def prepare_action(self, batch, state=None): + """Pad action""" + actions = pad_vector(batch[ACTION], self.config.max_action_dim) + if self.config.predict_relative_actions and actions.ndim == 3: + if self.config.relative_actions_mode == "first": + actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1) + elif self.config.relative_actions_mode == "state": + assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], "Relative action mode 'state' requires the action and state to have the same dimension." + if state.ndim == 2: + state = state.unsqueeze(1) + actions = actions - state + else: + actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1) + return actions + +def pad_tensor(tensor, max_len, pad_value=0): + """ + Efficiently pads a tensor along sequence dimension to match max_len. + + Args: + tensor (torch.Tensor): Shape (B, L, ...) or (B, L). + max_len (int): Fixed sequence length. + pad_value (int/float): Value for padding. + + Returns: + torch.Tensor: Shape (B, max_len, ...) or (B, max_len). 
+ """ + B, L = tensor.shape[:2] + + # Create a padded tensor of max_len and copy the existing values + padded_tensor = torch.full((B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device) + padded_tensor[:, :L] = tensor # Efficient in-place copy + + return padded_tensor + +class VLAFlowMatching(nn.Module): + """ + Ļ€0: A Vision-Language-Action Flow Model for General Robot Control + + [Paper](https://www.physicalintelligence.company/download/pi0.pdf) + [Jax code](https://github.com/Physical-Intelligence/openpi) + + Designed by Physical Intelligence. Ported from Jax by Hugging Face. + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + │ actions │ + │ ā–² │ + │ ā”Œā”“ā”€ā”€ā”€ā”€ā”€ā” │ + │ kv cache │Gemma │ │ + │ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā–ŗā”‚Expert│ │ + │ │ │ │ │ + │ ā”Œā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │x 10 │ │ + │ │ │ ā””ā–²ā”€ā”€ā–²ā”€ā”€ā”˜ │ + │ │ VLM │ │ │ │ + │ │ │ │ robot state │ + │ │ │ noise │ + │ ā””ā–²ā”€ā”€ā–²ā”€ā”€ā”€ā”€ā”€ā”˜ │ + │ │ │ │ + │ │ image(s) │ + │ language tokens │ + ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + """ + + def __init__(self, config): + super().__init__() + self.config = config + + self.vlm_with_expert = SmolVLMWithExpertModel(model_id=self.config.vlm_model_name, + freeze_vision_encoder=self.config.freeze_vision_encoder, + train_expert_only=self.config.train_expert_only, + attention_implementation=self.config.attention_implementation, + load_vlm_weights=self.config.load_vlm_weights, + attention_mode=self.config.attention_mode, + num_expert_layers=self.config.num_expert_layers, + num_vlm_layers=self.config.num_vlm_layers, + self_attn_every_n_layers=self.config.self_attn_every_n_layers, + expert_width_multiplier=self.config.expert_width_multiplier, + self_attn_only_actions=self.config.self_attn_only_actions, + ) + # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert) + 
self.vlm_with_expert.configure_peft(config=self.config) + # Projections are float32 + self.state_to_prefix = self.config.state_to_prefix + if self.state_to_prefix: + self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size) + else: + self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size) + self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) + self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) + + self.action_time_mlp_in = nn.Linear(self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size) + self.action_time_mlp_out = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size) + + self.set_requires_grad() + # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ... 
+ if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]): + if "SmolVLM-Instruct" in self.config.vlm_model_name: + self.fake_image_token = 49152 + self.global_image_token = [44, 13906, 29, 6266, 46] + self.global_image_start_token = torch.tensor([self.fake_image_token] + self.global_image_token, dtype=torch.long) + else: + self.fake_image_token = 49189 + self.global_image_token = 49152 + self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + else: + self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id + self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id + self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + + self.add_image_special_tokens = self.config.add_image_special_tokens + self.add_local_special_image_tokens = self.config.add_local_special_image_tokens + self.local_image_tokens = [torch.tensor([self.fake_image_token, tok], dtype=torch.long) for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]] # assume 3 x 3 grid + + self.local_image_start_token = self.global_image_start_token + self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) + self.prefix_length = self.config.prefix_length + self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") + self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 + self.causal_attention_on_history = self.config.causal_attention_on_history + + + + + # def configure_peft(self, model): + # # return model + # self.peft_method = self.config.peft_method + # if "lora" in self.peft_method: + # peft_config = self.config.peft_config + # target_modules = peft_config.target_modules + # if not isinstance(target_modules, list): + # target_modules = target_modules.split(",") + # lora_config = LoraConfig( + # 
task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) + # r=peft_config.r, # The rank of the low-rank adaptation + # lora_alpha=peft_config.lora_alpha, # Scaling factor + # lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers + # target_modules=target_modules, # The components where LoRA is applied + # exclude_modules=["gemma_expert", "model.gemma_expert.model.layers"], # FIXME(mshukor): this does not work for now + # ) + # # LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=1, lora_dropout=0, target_modules=["q_proj"], exclude_modules=["gemma_expert"]) + # self.lora_config = lora_config + # # Apply LoRA and ensure only LoRA parameters are trainable + + # model = get_peft_model(model, lora_config) + # assert self.config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? + # for name, param in model.named_parameters(): + # if ( + # "lora" in name + # ): # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer + # param.requires_grad = True + # return model + + def set_requires_grad(self): + for params in self.state_proj.parameters(): + params.requires_grad = self.config.train_state_proj + + def sample_noise(self, shape, device): + noise = torch.normal( + mean=0.0, + std=1.0, + size=shape, + dtype=torch.float32, + device=device, + ) + return noise + + def sample_time(self, bsize, device): + time_beta = sample_beta(1.5, 1.0, bsize, device) + time = time_beta * 0.999 + 0.001 + return time.to(dtype=torch.float32, device=device) + + def embed_prefix( + self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Embed images with SigLIP and language tokens with embedding layer to prepare + for SmolVLM transformer processing. 
+ """ + # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty + embs = [] + pad_masks = [] + att_masks = [] + num_images = len(images) // self.num_past_images + # TODO: remove for loop + for img_idx, ( + img, + img_mask, + ) in enumerate(zip(images, img_masks, strict=False)): + # FIXME(mshukor): add special tokens for the history each history_steps or not + if self.add_image_special_tokens: + if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1: + local_token_idx = img_idx % num_images + image_start_token = self.vlm_with_expert.embed_language_tokens(self.local_image_tokens[local_token_idx].to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + else: + image_start_token = self.vlm_with_expert.embed_language_tokens(self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + image_start_mask = torch.ones_like(image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device) + if self.causal_attention_on_history and img_idx % num_images == 0: + att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1) + else: + att_masks += [0] * (image_start_mask.shape[-1]) + embs.append(image_start_token) + pad_masks.append(image_start_mask) + + img_emb = self.vlm_with_expert.embed_image(img) + img_emb = img_emb #.to(dtype=self.vlm_with_expert.type) + + # Normalize image embeddings + img_emb_dim = img_emb.shape[-1] + img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) + + bsize, num_img_embs = img_emb.shape[:2] + img_mask = img_mask[:, None].expand(bsize, num_img_embs) + + # FIXME(mshukor): add special image tokens. Assume no tiling fake global images fake + # template <|im_start|>User: What actions? image tokens \nAssistant: or processor.apply_chat_template? 
+ # processor.fake_image_token + # processor.global_image_token + + embs.append(img_emb) + pad_masks.append(img_mask) + + att_masks += [0] * (num_img_embs) + if self.add_image_special_tokens: + if not self.add_local_special_image_tokens or (self.add_local_special_image_tokens and img_idx % num_images == num_images - 1): + image_end_token = self.vlm_with_expert.embed_language_tokens(self.image_end_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + image_end_mask = torch.ones_like(image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device) + embs.append(image_end_token) + pad_masks.append(image_end_mask) + att_masks += [0] * (image_end_mask.shape[1]) + lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) + # Normalize language embeddings + lang_emb_dim = lang_emb.shape[-1] + lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm? + + embs.append(lang_emb) + pad_masks.append(lang_masks) + + # full attention between image and language inputs + num_lang_embs = lang_emb.shape[1] + att_masks += [0] * num_lang_embs + + if state is not None and self.state_to_prefix: + state_emb = self.state_proj(state) + state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + embs.append(state_emb) + bsize = state_emb.shape[0] + dtype = state_emb.dtype + device = state_emb.device + + states_seq_len = state_emb.shape[1] + state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) + pad_masks.append(state_mask) + + # Set attention masks so that image and language inputs do not attend to state or actions + # att_masks += [1] + [0]*(states_seq_len - 1) + att_masks += [1]*(states_seq_len) + embs = torch.cat(embs, dim=1) + pad_masks = torch.cat(pad_masks, dim=1) + att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) + att_masks = att_masks[None, :] + + seq_len = pad_masks.shape[1] + if 
seq_len < self.prefix_length: + embs = pad_tensor(embs, self.prefix_length, pad_value=0) + pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) + att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) + + att_masks = att_masks.expand(bsize, -1) + + return embs, pad_masks, att_masks + + def embed_suffix(self, state, noisy_actions, timestep): + """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" + embs = [] + pad_masks = [] + att_masks = [] + + # Embed state + if not self.state_to_prefix: + state_emb = self.state_proj(state) + state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + embs.append(state_emb) + bsize = state_emb.shape[0] + dtype = state_emb.dtype + device = state_emb.device + + states_seq_len = state_emb.shape[1] + state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) + pad_masks.append(state_mask) + + # Set attention masks so that image and language inputs do not attend to state or actions + att_masks += [1] + [0]*(states_seq_len - 1) + + + # Fuse timestep + action information using an MLP + action_emb = self.action_in_proj(noisy_actions) + device = action_emb.device + bsize = action_emb.shape[0] + dtype = action_emb.dtype + # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] + time_emb = create_sinusoidal_pos_embedding( + timestep, self.vlm_with_expert.expert_hidden_size, min_period=4e-3, max_period=4.0, device=device + ) + time_emb = time_emb.type(dtype=dtype) + + time_emb = time_emb[:, None, :].expand_as(action_emb) + action_time_emb = torch.cat([action_emb, time_emb], dim=2) + + action_time_emb = self.action_time_mlp_in(action_time_emb) + action_time_emb = F.silu(action_time_emb) # swish == silu + action_time_emb = self.action_time_mlp_out(action_time_emb) + + # Add to input tokens + embs.append(action_time_emb) + + bsize, action_time_dim = action_time_emb.shape[:2] + 
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) + pad_masks.append(action_time_mask) + + # Set attention masks so that image, language and state inputs do not attend to action tokens + if self.config.causal_action_attention_mask: + att_masks += [1] * self.config.chunk_size + else: + att_masks += [1] + ([0] * (self.config.chunk_size - 1)) + embs = torch.cat(embs, dim=1) + pad_masks = torch.cat(pad_masks, dim=1) + att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) + att_masks = att_masks[None, :].expand(bsize, len(att_masks)) + return embs, pad_masks, att_masks + + def forward( + self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None + ) -> Tensor: + """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" + if noise is None: + noise = self.sample_noise(actions.shape, actions.device) + + if time is None: + time = self.sample_time(actions.shape[0], actions.device) + + time_expanded = time[:, None, None] + if self.config.regression_loss: + # Hack to compare regression to flow matching + time = torch.zeros_like(time, dtype=time.dtype, device=time.device) + x_t = torch.zeros_like(actions, dtype=actions.dtype, device=actions.device) + u_t = actions + else: + x_t = time_expanded * noise + (1 - time_expanded) * actions + u_t = noise - actions + prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( + images, img_masks, lang_tokens, lang_masks, state=state + ) + suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time) + + pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) + att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) + + att_2d_masks = make_att_2d_masks(pad_masks, att_masks) + position_ids = torch.cumsum(pad_masks, dim=1) - 1 + (_, suffix_out), _ = self.vlm_with_expert.forward( + attention_mask=att_2d_masks, + position_ids=position_ids, + past_key_values=None, + 
inputs_embeds=[prefix_embs, suffix_embs], + use_cache=False, + fill_kv_cache=False, + ) + suffix_out = suffix_out[:, -self.config.chunk_size :] + # Original openpi code, upcast attention output + suffix_out = suffix_out.to(dtype=torch.float32) + v_t = self.action_out_proj(suffix_out) + if self.config.regression_loss: + losses = F.l1_loss(u_t, v_t, reduction="none") + else: + losses = F.mse_loss(u_t, v_t, reduction="none") + return losses + + def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: + """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" + bsize = state.shape[0] + device = state.device + + if noise is None: + actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) + noise = self.sample_noise(actions_shape, device) + + prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( + images, img_masks, lang_tokens, lang_masks, state=state + ) + prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) + prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 + # Compute image and language key value cache + _, past_key_values = self.vlm_with_expert.forward( + attention_mask=prefix_att_2d_masks, + position_ids=prefix_position_ids, + past_key_values=None, + inputs_embeds=[prefix_embs, None], + use_cache=self.config.use_cache, + fill_kv_cache=True, + ) + if self.config.regression_loss: + x_t = torch.zeros_like(noise, dtype=torch.float32, device=device) + expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device) + x_t = self.denoise_step( + state, + prefix_pad_masks, + past_key_values, + x_t, + expanded_time, + ) + else: + dt = -1.0 / self.config.num_steps + dt = torch.tensor(dt, dtype=torch.float32, device=device) + + x_t = noise + time = torch.tensor(1.0, dtype=torch.float32, device=device) + while time >= -dt / 2: + expanded_time = time.expand(bsize) + v_t = self.denoise_step( + state, + prefix_pad_masks, + 
past_key_values, + x_t, + expanded_time, + ) + + # Euler step + x_t += dt * v_t + time += dt + return x_t + + def denoise_step( + self, + state, + prefix_pad_masks, + past_key_values, + x_t, + timestep, + ): + """Apply one denoising step of the noise `x_t` at a given timestep.""" + suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, timestep) + + suffix_len = suffix_pad_masks.shape[1] + batch_size = prefix_pad_masks.shape[0] + prefix_len = prefix_pad_masks.shape[1] + prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) + + suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) + + full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) + prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] + position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 + + outputs_embeds, _ = self.vlm_with_expert.forward( + attention_mask=full_att_2d_masks, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=[None, suffix_embs], + use_cache=self.config.use_cache, + fill_kv_cache=False, + ) + suffix_out = outputs_embeds[1] + suffix_out = suffix_out[:, -self.config.chunk_size :] + suffix_out = suffix_out.to(dtype=torch.float32) + v_t = self.action_out_proj(suffix_out) + return v_t diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py new file mode 100644 index 000000000..b910679f1 --- /dev/null +++ b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py @@ -0,0 +1,824 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union +from functools import partial +import copy + +import torch +import torch.version +import torch.nn.functional as F # noqa: N812 +from peft import LoraConfig, TaskType, get_peft_model +from pytest import Cache +from torch import nn +from transformers import ( + AutoConfig, + GemmaForCausalLM, + AutoModelForImageTextToText, + AutoProcessor, + PretrainedConfig, + PreTrainedModel, + SmolVLMForConditionalGeneration, + AutoModel, + AutoModelForVision2Seq, +) +from transformers.models.auto import CONFIG_MAPPING +from transformers import SmolVLMModel, SmolVLMConfig +from lerobot.policies.smolpi0.flex_attention import flex_attention_forward + +def _round_up_to_multiple(x, multiple): + return (x + multiple - 1) // multiple * multiple + + +def apply_rope(x, positions, max_wavelength=10_000): + """ + Applies RoPE positions [B, L] to x [B, L, H, D]. 
+ """ + d_half = x.shape[-1] // 2 + device = x.device + dtype = x.dtype + x = x.to(torch.float32) + + freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device) + timescale = max_wavelength**freq_exponents + radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32) + + radians = radians[..., None, :] + + sin = torch.sin(radians) # .to(dtype=dtype) + cos = torch.cos(radians) # .to(dtype=dtype) + + x1, x2 = x.split(d_half, dim=-1) + res = torch.empty_like(x) + res[..., :d_half] = x1 * cos - x2 * sin + res[..., d_half:] = x2 * cos + x1 * sin + + return res.to(dtype) + + +# class SmolVLMWithExpertConfig(PretrainedConfig): +# model_type = "SmolVLMWithExpertModel" +# sub_configs = {"smolvlm_config": AutoConfig, "lm_expert_config": AutoConfig} + +# def __init__( +# self, +# smolvlm_config: dict | None = None, +# lm_expert_config: dict | None = None, +# freeze_vision_encoder: bool = True, +# train_expert_only: bool = True, +# attention_implementation: str = "eager", +# load_vlm_weights: bool = False, +# **kwargs, +# ): +# self.load_vlm_weights = load_vlm_weights +# self.freeze_vision_encoder = freeze_vision_encoder +# self.train_expert_only = train_expert_only +# self.attention_implementation = attention_implementation + +# if smolvlm_config is None: +# # Default config from Pi0 +# self.smolvlm_config = CONFIG_MAPPING["smolvlm"]( +# transformers_version="4.48.1", +# _vocab_size=257152, +# bos_token_id=2, +# eos_token_id=1, +# hidden_size=2048, +# image_token_index=257152, +# model_type="smolvlm", +# pad_token_id=0, +# projection_dim=2048, +# text_config={ +# "hidden_activation": "gelu_pytorch_tanh", +# "hidden_size": 2048, +# "intermediate_size": 16384, +# "model_type": "gemma", +# "num_attention_heads": 8, +# "num_hidden_layers": 18, +# "num_image_tokens": 256, +# "num_key_value_heads": 1, +# "torch_dtype": "float32", +# "vocab_size": 257152, +# }, +# vision_config={ +# "hidden_size": 1152, +# 
"intermediate_size": 4304, +# "model_type": "siglip_vision_model", +# "num_attention_heads": 16, +# "num_hidden_layers": 27, +# "num_image_tokens": 256, +# "patch_size": 14, +# "projection_dim": 2048, +# "projector_hidden_act": "gelu_fast", +# "torch_dtype": "float32", +# "vision_use_head": False, +# }, +# ) +# elif isinstance(self.paligemma_config, dict): +# # Override Pi0 default config for PaliGemma +# if "model_type" not in gemma_expert_config: +# paligemma_config["model_type"] = "paligemma" + +# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] +# self.paligemma_config = cfg_cls(**paligemma_config) + +# if gemma_expert_config is None: +# # Default config from Pi0 +# self.gemma_expert_config = CONFIG_MAPPING["gemma"]( +# attention_bias=False, +# attention_dropout=0.0, +# bos_token_id=2, +# eos_token_id=1, +# head_dim=256, +# hidden_act="gelu_pytorch_tanh", +# hidden_activation="gelu_pytorch_tanh", +# hidden_size=1024, +# initializer_range=0.02, +# intermediate_size=4096, +# max_position_embeddings=8192, +# model_type="gemma", +# num_attention_heads=8, +# num_hidden_layers=18, +# num_key_value_heads=1, +# pad_token_id=0, +# rms_norm_eps=1e-06, +# rope_theta=10000.0, +# torch_dtype="float32", +# transformers_version="4.48.1", +# use_cache=True, +# vocab_size=257152, +# ) +# elif isinstance(self.gemma_expert_config, dict): +# # Override Pi0 default config for Gemma Expert +# if "model_type" not in gemma_expert_config: +# gemma_expert_config["model_type"] = "gemma" + +# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] +# self.gemma_expert_config = cfg_cls(**gemma_expert_config) + +# super().__init__(**kwargs) + +# def __post_init__(self): +# super().__post_init__() +# if self.train_expert_only and not self.freeze_vision_encoder: +# raise ValueError( +# "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible." 
+# ) + +# if self.attention_implementation not in ["eager", "fa2", "flex"]: +# raise ValueError( +# f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'." +# ) + +def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256): + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + return hidden_dim + + +class SmolVLMWithExpertModel(nn.Module): + # config_class = PaliGemmaWithExpertConfig + + def __init__(self, model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + load_vlm_weights: bool = True, train_expert_only: bool = True, freeze_vision_encoder: bool = False, + attention_implementation: str = "eager", attention_mode: str = "self_attn", num_expert_layers: int = -1, + num_vlm_layers: int = -1, self_attn_every_n_layers: int = -1, expert_width_multiplier: float = 0.5, self_attn_only_actions: bool = False): + super().__init__() + if load_vlm_weights: + print(f"Loading {model_id} weights ...") + if "SmolVLM-" in model_id: + self.vlm = AutoModelForVision2Seq.from_pretrained( + model_id, + device_map="cuda", + torch_dtype="bfloat16", + low_cpu_mem_usage=True, + ) + else: + # model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + self.vlm = AutoModelForImageTextToText.from_pretrained( + model_id, + device_map="cuda", + torch_dtype="bfloat16", + low_cpu_mem_usage=True, + # attn_implementation="eager", + # attn_implementation="flash_attention_2" + ) + config = self.vlm.config + else: + config = AutoConfig.from_pretrained(model_id) + self.vlm = SmolVLMForConditionalGeneration(config=config) + self.processor = AutoProcessor.from_pretrained(model_id) + if num_vlm_layers > 0: + print(f"Reducing the number of VLM layers to {num_vlm_layers} ...") + self.get_vlm_model().text_model.layers = self.get_vlm_model().text_model.layers[:num_vlm_layers] + self.num_vlm_layers = 
len(self.get_vlm_model().text_model.layers) + self.config = config + # Smaller lm expert + lm_expert_config = copy.deepcopy(config.text_config) + hidden_size = lm_expert_config.hidden_size + lm_expert_config.hidden_size = int(hidden_size*expert_width_multiplier) #hidden_size // 2 + lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size*expert_width_multiplier)) + lm_expert_config.num_hidden_layers = self.num_vlm_layers + if num_expert_layers > 0 : + assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}" + lm_expert_config.num_hidden_layers = num_expert_layers + # lm_expert_config.head_dim = lm_expert_config.head_dim * 2 + self.lm_expert = AutoModel.from_config(lm_expert_config) + + self.num_expert_layers = len(self.lm_expert.layers) + self.self_attn_every_n_layers = self_attn_every_n_layers + self.self_attn_only_actions = self_attn_only_actions + if "cross" in attention_mode: + # Reshape qkv projections to have the same input dimension as the vlm + for layer_idx in range(len(self.lm_expert.layers)): + if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0: + continue + self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear( + config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + ) + self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear( + config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + ) + # Remove unused embed_tokens + self.lm_expert.embed_tokens = None + + self.num_attention_heads = self.config.text_config.num_attention_heads + self.num_key_value_heads = self.config.text_config.num_key_value_heads + 
+ self.freeze_vision_encoder = freeze_vision_encoder + self.train_expert_only = train_expert_only + self.attention_implementation = attention_implementation + self.attention_mode = attention_mode + self.expert_hidden_size = lm_expert_config.hidden_size + # self.to_bfloat16_like_physical_intelligence() + self.set_requires_grad() + + def configure_peft(self, config): + # return model + self.peft_method = config.peft_method + self.peft_target_model = config.peft_target_model + if "lora" in self.peft_method: + peft_config = config.peft_config + target_modules = peft_config.target_modules + if not isinstance(target_modules, list): + target_modules = target_modules.split(",") + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) + r=peft_config.r, # The rank of the low-rank adaptation + lora_alpha=peft_config.lora_alpha, # Scaling factor + lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers + target_modules=target_modules, # The components where LoRA is applied + exclude_modules=[ + "lm_expert", + "model.lm_expert.model.layers", + ], # FIXME(mshukor): this does not work for now + ) + self.lora_config = lora_config + # Apply LoRA and ensure only LoRA parameters are trainable + if "text" in self.peft_target_model: + self.get_vlm_model().text_model = get_peft_model(self.get_vlm_model().text_model, lora_config) + else: + self.vlm = get_peft_model(self.vlm, lora_config) + # assert config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? + for name, param in self.vlm.named_parameters(): + if ( + "lora" in name and "text_model.model.layers.17" not in name + ): # lm_head is not a parameter in most LLMs because it's tied to the embedding layer + param.requires_grad = True + else: + param.requires_grad = False + + def merge_lora_weights(self): + """ + Merge LoRA weights into the base model.
+ """ + if "text" in self.peft_target_model: + self.get_vlm_model().text_model = self.get_vlm_model().text_model.merge_and_unload() + else: + self.vlm = self.vlm.merge_and_unload() + + def get_vlm_model(self,): + if hasattr(self.vlm.model, "model"): # When using peft + return self.vlm.model.model + else: + return self.vlm.model + + def set_requires_grad(self): + if self.freeze_vision_encoder: + self.get_vlm_model().vision_model.eval() + for params in self.get_vlm_model().vision_model.parameters(): + params.requires_grad = False + if self.train_expert_only: + self.vlm.eval() + for params in self.vlm.parameters(): + params.requires_grad = False + else: + # To avoid unused params issue with distributed training + last_layers = [self.num_vlm_layers - 1] + if self.num_vlm_layers != self.num_expert_layers and self.num_vlm_layers % self.num_expert_layers == 0: + last_layers.append(self.num_vlm_layers - 2) + frozen_layers = [ + "lm_head", + "text_model.model.norm.weight", + ] + for layer in last_layers: + frozen_layers.append(f"text_model.model.layers.{layer}.") + + for name, params in self.vlm.named_parameters(): + if any( + [ + k in name + for k in frozen_layers + ] + ): + params.requires_grad = False + # To avoid unused params issue with distributed training + for name, params in self.lm_expert.named_parameters(): + if any( + [ + k in name + for k in [ + "lm_head", + ] + ] + ): + params.requires_grad = False + + def train(self, mode: bool = True): + super().train(mode) + + if self.freeze_vision_encoder: + self.get_vlm_model().vision_model.eval() + + if self.train_expert_only: + self.vlm.eval() + + # def to_bfloat16_like_physical_intelligence(self): + # self.vlm = self.vlm.to(dtype=torch.bfloat16) + + # params_to_change_dtype = [ + # "language_model.model.layers", + # "gemma_expert.model.layers", + # "vision_tower", + # "multi_modal", + # ] + # for name, param in self.named_parameters(): + # if any(selector in name for selector in params_to_change_dtype): + # param.data 
= param.data.to(dtype=torch.bfloat16) + + def embed_image(self, image: torch.Tensor): + patch_attention_mask = None + # # FIXME(mshukor): probably not needed as we don't have padded images here + # pixel_values = image.unsqueeze(1) + # batch_size, num_images, num_channels, height, width = pixel_values.shape + # pixel_values = pixel_values + # pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:]) + + # # Remove padding images - padding images are full 0. + # nb_values_per_image = pixel_values.shape[1:].numel() + # real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image + + # if not any(real_images_inds): + # # no images, leave one empty image. + # real_images_inds[0] = True + + # pixel_values = pixel_values[real_images_inds].contiguous() + + # # Handle the vision attention mask + + # pixel_attention_mask = torch.ones( + # size=[pixel_values.shape[i] for i in (0, 2, 3)], + # dtype=torch.bool, + # device=pixel_values.device, + # ) + + # patch_size = self.vlm.config.vision_config.patch_size + # patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size) + # patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) + # patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # FIXME(mshukor): add special image tokens specific to smolvlm + # Get sequence from the vision encoder + image_hidden_states = self.get_vlm_model().vision_model( + pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype), + patch_attention_mask=patch_attention_mask, + ).last_hidden_state + # Modality projection & resampling + image_hidden_states = self.get_vlm_model().connector(image_hidden_states) + return image_hidden_states + + def embed_language_tokens(self, tokens: torch.Tensor): + return self.get_vlm_model().text_model.get_input_embeddings()(tokens) + + def forward_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, 
attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values=None) -> list[torch.Tensor]: + + query_states = [] + key_states = [] + value_states = [] + for i, hidden_states in enumerate(inputs_embeds): + layer = model_layers[i][layer_idx] + if hidden_states is None or layer is None: + continue + + # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) + # hidden_states = hidden_states * normalizer + hidden_states = layer.input_layernorm(hidden_states) + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) + + hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + query_states.append(query_state) + key_states.append(key_state) + value_states.append(value_state) + + # FIXME(mshukor): self attention always when having only the prefix + # B,L,H,D with L sequence length, H number of heads, D head dim + # concatenate on the number of embeddings/tokens + query_states = torch.cat(query_states, dim=1) + key_states = torch.cat(key_states, dim=1) + value_states = torch.cat(value_states, dim=1) + # FIXME(mshukor): seq should be B, H, L, D ? 
+ seq_len = query_states.shape[1] + if seq_len < position_ids.shape[1]: + _position_ids = position_ids[:, :seq_len] + _attention_mask = attention_mask[:, :seq_len, :seq_len] + else: + _position_ids = position_ids + _attention_mask = attention_mask + + if self.self_attn_only_actions: + attention_mask_ = _attention_mask.clone() + position_ids_ = _position_ids.clone() + if inputs_embeds[1] is not None: + suffix_len = inputs_embeds[1].shape[1] + attention_mask_[:, -suffix_len:, :-suffix_len] = False + position_ids_[:, -suffix_len:] = _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None] + else: + attention_mask_ = _attention_mask + position_ids_ = _position_ids + + query_states = apply_rope(query_states, position_ids_) # FIXME(mshukor): this assumes we have always the vlm features? + key_states = apply_rope(key_states, position_ids_) + + if use_cache and past_key_values is None: + past_key_values = {} + + if use_cache: + if fill_kv_cache: + past_key_values[layer_idx] = { + "key_states": key_states, + "value_states": value_states, + } + else: + # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # the max len, then we (for instance) double the cache size. This implementation already exists + # in `transformers`. 
(molbap) + key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) + value_states = torch.cat( + [past_key_values[layer_idx]["value_states"], value_states], dim=1 + ) + + attention_interface = self.get_attention_interface() + + att_output = attention_interface( + attention_mask_, batch_size, head_dim, query_states, key_states, value_states + ) + # att_output = att_output.to(dtype=models[i].dtype) + + return [att_output], past_key_values + + + def forward_cross_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values = None) -> list[torch.Tensor]: + + attention_interface = self.get_attention_interface() + + att_outputs = [] + assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}" + + if len(inputs_embeds) == 2 and not past_key_values: + # Prefix attention + seq_len = inputs_embeds[0].shape[1] + position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:] + prefix_attention_mask = attention_mask[:, :seq_len, :seq_len] + + layer = model_layers[0][layer_idx] + + hidden_states = layer.input_layernorm(inputs_embeds[0]) + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) + + hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + # B,L,H,D with L sequence length, H number of heads, D head dim + query_states = apply_rope(query_state, position_id) + key_states = apply_rope(key_state, position_id) + + att_output = attention_interface( + prefix_attention_mask, batch_size, 
head_dim, query_states, key_states, value_states + ) + att_outputs.append(att_output) + else: + expert_position_id = position_ids + + if use_cache and past_key_values is None: + past_key_values = {} + + if use_cache: + if fill_kv_cache: + past_key_values[layer_idx] = { + "key_states": key_states, + "value_states": value_states, + } + else: + # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # the max len, then we (for instance) double the cache size. This implementation already exists + # in `transformers`. (molbap) + key_states = past_key_values[layer_idx]["key_states"] + value_states = past_key_values[layer_idx]["value_states"] + + + # Expert + expert_layer = model_layers[1][layer_idx] + if expert_layer is not None: + expert_hidden_states = expert_layer.input_layernorm(inputs_embeds[1]) + + expert_input_shape = expert_hidden_states.shape[:-1] + expert_hidden_shape = (*expert_input_shape, -1, expert_layer.self_attn.head_dim) + + expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype) + expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape) + + + _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(*key_states.shape[:2], -1) + expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(*_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim) # k_proj should have same dim as kv + + _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(*value_states.shape[:2], -1) + expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(*_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim) + + expert_position_id = expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values # start from 0 + expert_attention_mask = 
attention_mask[:, -inputs_embeds[1].shape[1]:, :expert_key_states.shape[1]:] # take into account kv + + expert_query_states = apply_rope(expert_query_state, expert_position_id) + # expert_key_states = apply_rope(expert_key_state, expert_position_id) + + att_output = attention_interface( + expert_attention_mask, batch_size, head_dim, expert_query_states, expert_key_states, expert_value_states + ) + att_outputs.append(att_output) + else: + att_outputs.append(None) + + # att_output = att_output.to(dtype=models[i].dtype) + return att_outputs, past_key_values + + def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient? + vlm_layers = [] + expert_layers = [] + multiple_of = self.num_vlm_layers // self.num_expert_layers + for i in range(self.num_vlm_layers): + if multiple_of > 0 and i > 0 and i % multiple_of != 0: + expert_layer = None + else: + expert_layer_index = i // multiple_of if multiple_of > 0 else i + expert_layer = models[1].layers[expert_layer_index] + vlm_layers.append(models[0].layers[i]) + expert_layers.append(expert_layer) + return [vlm_layers, expert_layers] + # TODO: break down this huge forward into modules or functions + def forward( + self, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, + inputs_embeds: List[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + fill_kv_cache: Optional[bool] = None, + ): + models = [self.get_vlm_model().text_model, self.lm_expert] + model_layers = self.get_model_layers(models) + for hidden_states in inputs_embeds: + # TODO this is very inefficient + # dtype is always the same, batch size too (if > 1 len) + # device could be trickier in multi gpu edge cases but that's it + if hidden_states is None: + continue + batch_size = hidden_states.shape[0] + + # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right 
depending on the gen or train + if self.attention_implementation == "flex": + if inputs_embeds[0] is not None and inputs_embeds[1] is not None and attention_mask.shape[-1] == attention_mask.shape[-2] and past_key_values is None: # Now only during training + seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1] + padded_seq_len = _round_up_to_multiple(seq_len, 128) # FIXME(mshukor): more efficient to have a fixed seq len? + b_mask, q_len, kv_len = attention_mask.shape # The shape of your mask + pad = padded_seq_len - q_len + attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True) + inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0) + position_ids = F.pad(position_ids, (0, pad), value=0) + + + # RMSNorm + num_layers = self.num_vlm_layers + head_dim = self.vlm.config.text_config.head_dim + for layer_idx in range(num_layers): + if fill_kv_cache or "cross" not in self.attention_mode or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0): + att_outputs, past_key_values = self.forward_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + else: + att_outputs, past_key_values = self.forward_cross_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + # query_states = [] + # key_states = [] + # value_states = [] + # for i, hidden_states in enumerate(inputs_embeds): + # if hidden_states is None: + # continue + # layer = models[i].layers[layer_idx] + # # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) + # # hidden_states = hidden_states * normalizer + # hidden_states = layer.input_layernorm(hidden_states) + + # input_shape = hidden_states.shape[:-1] + # hidden_shape = (*input_shape, -1, 
layer.self_attn.head_dim) + + # hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + # query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + # key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + # value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + # query_states.append(query_state) + # key_states.append(key_state) + # value_states.append(value_state) + + # # FIXME(mshukor): self attention always when having only the prefix + # # B,L,H,D with L sequence length, H number of heads, D head dim + # # concatenate on the number of embeddings/tokens + # query_states = torch.cat(query_states, dim=1) + # key_states = torch.cat(key_states, dim=1) + # value_states = torch.cat(value_states, dim=1) + # # FIXME(mshukor): seq should be B, H, L, D ? + # query_states = apply_rope(query_states, position_ids) + # key_states = apply_rope(key_states, position_ids) + + # if use_cache and past_key_values is None: + # past_key_values = {} + + # if use_cache: + # if fill_kv_cache: + # past_key_values[layer_idx] = { + # "key_states": key_states, + # "value_states": value_states, + # } + # else: + # # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # # the max len, then we (for instance) double the cache size. This implementation already exists + # # in `transformers`. 
(molbap) + # key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) + # value_states = torch.cat( + # [past_key_values[layer_idx]["value_states"], value_states], dim=1 + # ) + + # attention_interface = self.get_attention_interface() + # att_output = attention_interface( + # attention_mask, batch_size, head_dim, query_states, key_states, value_states + # ) + + + # att_output = att_output.to(dtype=models[i].dtype) + + # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len]) + outputs_embeds = [] + start = 0 + for i, hidden_states in enumerate(inputs_embeds): + # layer = models[i].layers[layer_idx] + layer = model_layers[i][layer_idx] + att_output = att_outputs[i] if i < len(att_outputs) else att_outputs[0] # in case of self_attn + if hidden_states is not None: + if layer is None: + outputs_embeds.append(hidden_states) + continue + end = start + hidden_states.shape[1] + + if att_output.dtype != layer.self_attn.o_proj.weight.dtype: + att_output = att_output.to(layer.self_attn.o_proj.weight.dtype) + att_out = att_output[:, start:end] + out_emb = layer.self_attn.o_proj(att_out) + + # TODO: first dropout (by default 0.0) + # first residual + out_emb += hidden_states + after_first_residual = out_emb.clone() + + out_emb = layer.post_attention_layernorm(out_emb) + out_emb = layer.mlp(out_emb) + + # TODO: second dropout (by default 0.0) + + # second residual + out_emb += after_first_residual + + outputs_embeds.append(out_emb) + + start = end if len(att_outputs) == 1 else 0 + else: + outputs_embeds.append(None) + + inputs_embeds = outputs_embeds + + # final norm + outputs_embeds = [] + for i, hidden_states in enumerate(inputs_embeds): + if hidden_states is not None: + out_emb = models[i].norm(hidden_states) + outputs_embeds.append(out_emb) + else: + outputs_embeds.append(None) + return outputs_embeds, past_key_values + + def get_attention_interface(self): + if self.attention_implementation == "fa2": + 
attention_interface = self.flash_attention_forward + elif self.attention_implementation == "flex": + attention_interface = partial(flex_attention_forward, num_att_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads) + else: + attention_interface = self.eager_attention_forward + return attention_interface + + def flash_attention_forward( + self, attention_mask, batch_size, head_dim, query_states, key_states, value_states + ): + raise NotImplementedError("FA2 is not implemented (yet)") + + def eager_attention_forward( + self, attention_mask, batch_size, head_dim, query_states, key_states, value_states + ): + num_att_heads = self.num_attention_heads + num_key_value_heads = self.num_key_value_heads + num_key_value_groups = num_att_heads // num_key_value_heads + + # query_states: batch_size, sequence_length, num_att_head, head_dim + # key_states: batch_size, sequence_length, num_key_value_head, head_dim + # value_states: batch_size, sequence_length, num_key_value_head, head_dim + sequence_length = key_states.shape[1] + + key_states = key_states[:, :, :, None, :].expand( + batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim + ) + key_states = key_states.reshape( + batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim + ) + + value_states = value_states[:, :, :, None, :].expand( + batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim + ) + value_states = value_states.reshape( + batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim + ) + + # Attention here is upcasted to float32 to match the original eager implementation. 
+ + query_states = query_states.to(dtype=torch.float32) + key_states = key_states.to(dtype=torch.float32) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + + att_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + att_weights *= head_dim**-0.5 + + att_weights = att_weights.to(dtype=torch.float32) + big_neg = torch.finfo(att_weights.dtype).min #-2.3819763e38 # See gemma/modules.py + masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg) + probs = nn.functional.softmax(masked_att_weights, dim=-1) + probs = probs.to(dtype=value_states.dtype) + + # probs: batch_size, num_key_value_head, num_att_head, sequence_length, sequence_length + # value_states: batch_size, sequence_length, num_att_heads, head_dim + + att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3)) + + att_output = att_output.permute(0, 2, 1, 3) + # we use -1 because sequence length can change + att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) + + return att_output diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 95ed993d2..9b7e3520a 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -1,3 +1,955 @@ +# #!/usr/bin/env python + +# # Copyright 2025 HuggingFace Inc. team. All rights reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# # See the License for the specific language governing permissions and +# # limitations under the License. + +# """ +# SmolVLA: + +# [Paper](https://huggingface.co/papers/2506.01844) + +# Designed by Hugging Face. + +# Install smolvla extra dependencies: +# ```bash +# pip install -e ".[smolvla]" +# ``` + +# Example of finetuning the smolvla pretrained model (`smolvla_base`): +# ```bash +# lerobot-train \ +# --policy.path=lerobot/smolvla_base \ +# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ +# --batch_size=64 \ +# --steps=200000 +# ``` + +# Example of finetuning a smolVLA. SmolVLA is composed of a pretrained VLM, +# and an action expert. +# ```bash +# lerobot-train \ +# --policy.type=smolvla \ +# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ +# --batch_size=64 \ +# --steps=200000 +# ``` + +# Example of using the smolvla pretrained model outside LeRobot training framework: +# ```python +# policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base") +# ``` + +# """ + +# import math +# import os +# import re +# from collections import deque + +# import safetensors +# import torch +# import torch.nn.functional as F # noqa: N812 +# from torch import Tensor, nn +# from transformers import AutoProcessor + +# from lerobot.constants import ACTION +# from lerobot.policies.normalize import ( +# Normalize, +# Unnormalize, +# ) +# from lerobot.policies.pretrained import PreTrainedPolicy +# from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig +# from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel +# from lerobot.policies.utils import ( +# populate_queues, +# ) +# from lerobot.utils.utils import get_safe_dtype +# OBS_STATE = 'state' +# ACTION = 'actions' +# # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker +# _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") + + +# def canonicalise(k: str) -> str: +# """ +# Remove dataset-variant markers like '.so100-blue_' or '.so100_' 
from a +# normalisation-buffer key. +# """ +# return _VARIANT_RE.sub(".buffer_", k) + + +# def standardise_state_dict( +# checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True +# ) -> tuple[dict[str, torch.Tensor], list[str]]: +# """ +# • Re-keys `checkpoint ` so that every entry matches the *reference* key set. +# • If several variant keys collapse to the same canonical name we keep the +# first one and log the collision. +# • Returns the new dict + a list of entries that could not be matched. +# """ +# out, collisions, unmatched = {}, {}, [] + +# for k, v in checkpoint.items(): +# canon = canonicalise(k) +# if canon in ref_keys: +# if canon in out: # duplicate after collapsing +# collisions.setdefault(canon, []).append(k) +# else: +# out[canon] = v +# else: +# unmatched.append(k) + +# if verbose: +# for canon, variants in collisions.items(): +# print(f"[standardise_state_dict] '{canon}' ← {variants}") +# if unmatched: +# print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys") + +# out.update({k: checkpoint[k] for k in unmatched}) +# return out, unmatched + + +# def rename_checkpoint_keys(checkpoint: dict, rename_str: str): +# """ +# Renames keys in a checkpoint dictionary based on the given rename string. + +# Args: +# checkpoint (dict): The checkpoint dictionary. +# rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2". + +# Returns: +# dict: The modified checkpoint with renamed keys. 
+# """ + +# rename_dict = dict(pair.split("//") for pair in rename_str.split(",")) + +# new_checkpoint = {} +# for k, v in checkpoint.items(): +# for old_key, new_key in rename_dict.items(): +# if old_key in k: +# k = k.replace(old_key, new_key) +# new_checkpoint[k] = v +# return new_checkpoint + + +# def load_smolvla( +# model: torch.nn.Module, +# filename: str | os.PathLike, +# *, +# device: str = "cpu", +# checkpoint_keys_mapping: str = "", +# ) -> torch.nn.Module: +# state_dict = safetensors.torch.load_file(filename, device=device) + +# # Optional user-supplied renames (e.g. "model._orig_mod.//model.") +# if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping: +# state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping) + +# state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys())) + +# # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset +# norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs") +# state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)} + +# missing, unexpected = model.load_state_dict(state_dict, strict=False) + +# if not all(key.startswith(norm_keys) for key in missing) or unexpected: +# raise RuntimeError( +# "SmolVLA %d missing / %d unexpected keys", +# len(missing), +# len(unexpected), +# ) + +# return model + + +# def create_sinusoidal_pos_embedding( +# time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" +# ) -> Tensor: +# """Computes sine-cosine positional embedding vectors for scalar positions.""" +# if dimension % 2 != 0: +# raise ValueError(f"dimension ({dimension}) must be divisible by 2") + +# if time.ndim != 1: +# raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + +# dtype = get_safe_dtype(torch.float64, device.type) +# fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) +# period = min_period * 
(max_period / min_period) ** fraction + +# # Compute the outer product +# scaling_factor = 1.0 / period * 2 * math.pi +# sin_input = scaling_factor[None, :] * time[:, None] +# pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) +# return pos_emb + + +# def make_att_2d_masks(pad_masks, att_masks): +# """Copied from big_vision. + +# Tokens can attend to valid inputs tokens which have a cumulative mask_ar +# smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to +# setup several types of attention, for example: + +# [[1 1 1 1 1 1]]: pure causal attention. + +# [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between +# themselves and the last 3 tokens have a causal attention. The first +# entry could also be a 1 without changing behaviour. + +# [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a +# block can attend all previous blocks and all tokens on the same block. + +# Args: +# input_mask: bool[B, N] true if its part of the input, false if padding. +# mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on +# it and 0 where it shares the same attention mask as the previous token. 
+# """ +# if att_masks.ndim != 2: +# raise ValueError(att_masks.ndim) +# if pad_masks.ndim != 2: +# raise ValueError(pad_masks.ndim) + +# cumsum = torch.cumsum(att_masks, dim=1) +# att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] +# pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] +# att_2d_masks = att_2d_masks & pad_2d_masks +# return att_2d_masks + + +# def resize_with_pad(img, width, height, pad_value=-1): +# # assume no-op when width height fits already +# if img.ndim != 4: +# raise ValueError(f"(b,c,h,w) expected, but {img.shape}") + +# cur_height, cur_width = img.shape[2:] + +# ratio = max(cur_width / width, cur_height / height) +# resized_height = int(cur_height / ratio) +# resized_width = int(cur_width / ratio) +# resized_img = F.interpolate( +# img, size=(resized_height, resized_width), mode="bilinear", align_corners=False +# ) + +# pad_height = max(0, int(height - resized_height)) +# pad_width = max(0, int(width - resized_width)) + +# # pad on left and top of image +# padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) +# return padded_img + + +# def pad_vector(vector, new_dim): +# """Can be (batch_size x sequence_length x features_dimension) +# or (batch_size x features_dimension) +# """ +# if vector.shape[-1] == new_dim: +# return vector +# shape = list(vector.shape) +# current_dim = shape[-1] +# shape[-1] = new_dim +# new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) +# new_vector[..., :current_dim] = vector +# return new_vector + + +# def normalize(x, min_val, max_val): +# return (x - min_val) / (max_val - min_val) + + +# def unnormalize(x, min_val, max_val): +# return x * (max_val - min_val) + min_val + + +# def safe_arcsin(value): +# # This ensures that the input stays within +# # [āˆ’1,1] to avoid invalid values for arcsin +# return torch.arcsin(torch.clamp(value, -1.0, 1.0)) + + +# def aloha_gripper_to_angular(value): +# # Aloha transforms the gripper positions into a linear 
space. The following code +# # reverses this transformation to be consistent with smolvla which is pretrained in +# # angular space. +# # +# # These values are coming from the Aloha code: +# # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED +# value = unnormalize(value, min_val=0.01844, max_val=0.05800) + +# # This is the inverse of the angular to linear transformation inside the Interbotix code. +# def linear_to_radian(linear_position, arm_length, horn_radius): +# value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) +# return safe_arcsin(value) + +# # The constants are taken from the Interbotix code. +# value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) + +# # Normalize to [0, 1]. +# # The values 0.4 and 1.5 were measured on an actual Trossen robot. +# return normalize(value, min_val=0.4, max_val=1.5) + + +# def aloha_gripper_from_angular(value): +# # Convert from the gripper position used by smolvla to the gripper position that is used by Aloha. +# # Note that the units are still angular but the range is different. + +# # The values 0.4 and 1.5 were measured on an actual Trossen robot. +# value = unnormalize(value, min_val=0.4, max_val=1.5) + +# # These values are coming from the Aloha code: +# # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE +# return normalize(value, min_val=-0.6213, max_val=1.4910) + + +# def aloha_gripper_from_angular_inv(value): +# # Directly inverts the gripper_from_angular function. 
+# value = unnormalize(value, min_val=-0.6213, max_val=1.4910) +# return normalize(value, min_val=0.4, max_val=1.5) + + +# class SmolVLAPolicy(PreTrainedPolicy): +# """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" + +# config_class = SmolVLAConfig +# name = "smolvla" + +# def __init__( +# self, +# config: SmolVLAConfig, +# dataset_stats: dict[str, dict[str, Tensor]] | None = None, +# ): +# """ +# Args: +# config: Policy configuration class instance or None, in which case the default instantiation of +# the configuration class is used. +# dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected +# that they will be passed with a call to `load_state_dict` before the policy is used. +# """ + +# super().__init__(config) +# config.validate_features() +# self.config = config +# self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats) +# self.normalize_targets = Normalize( +# config.output_features, config.normalization_mapping, dataset_stats +# ) +# self.unnormalize_outputs = Unnormalize( +# config.output_features, config.normalization_mapping, dataset_stats +# ) + +# self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer +# self.model = VLAFlowMatching(config) +# self.reset() + +# def reset(self): +# """This should be called whenever the environment is reset.""" +# self._queues = { +# ACTION: deque(maxlen=self.config.n_action_steps), +# } + +# # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues +# @classmethod +# def _load_as_safetensor( +# cls, +# model: "SmolVLAPolicy", +# model_file: str, +# map_location: str, +# strict: bool, +# ): +# safetensors.torch.load_model(model, model_file, strict=strict, device=map_location) +# return load_smolvla( +# model, +# model_file, +# device=map_location, +# checkpoint_keys_mapping="model._orig_mod.//model.", +# ) + +# 
def get_optim_params(self) -> dict: +# return self.parameters() + +# def _get_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# # TODO: Check if this for loop is needed. +# # Context: In fact, self.queues contains only ACTION field, and in inference, we don't have action in the batch +# # In the case of offline inference, we have the action in the batch +# # that why without the k != ACTION check, it will raise an error because we are trying to stack +# # on an empty container. +# for k in batch: +# if k in self._queues and k != ACTION: +# batch[k] = torch.stack(list(self._queues[k]), dim=1) + +# images, img_masks = self.prepare_images(batch) +# state = self.prepare_state(batch) +# lang_tokens, lang_masks = self.prepare_language(batch) + +# actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise) + +# # Unpad actions +# original_action_dim = self.config.action_feature.shape[0] +# actions = actions[:, :, :original_action_dim] + +# actions = self.unnormalize_outputs({ACTION: actions})[ACTION] + +# if self.config.adapt_to_pi_aloha: +# actions = self._pi_aloha_encode_actions(actions) + +# return actions + +# def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: +# if self.config.adapt_to_pi_aloha: +# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + +# batch = self.normalize_inputs(batch) + +# return batch + +# @torch.no_grad() +# def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# self.eval() + +# batch = self._prepare_batch(batch) +# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + +# actions = self._get_action_chunk(batch, noise) +# return actions + +# @torch.no_grad() +# def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# """Select a single action given environment observations. 
+ +# This method wraps `select_actions` in order to return one action at a time for execution in the +# environment. It works by managing the actions in a queue and only calling `select_actions` when the +# queue is empty. +# """ +# self.eval() +# batch = self._prepare_batch(batch) +# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + +# # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by +# # querying the policy. +# if len(self._queues[ACTION]) == 0: +# actions = self._get_action_chunk(batch, noise) + +# # `self.predict_action_chunk` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue +# # effectively has shape (n_action_steps, batch_size, *), hence the transpose. +# self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) + +# return self._queues[ACTION].popleft() + +# def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: +# """Do a full training forward pass to compute the loss""" +# if self.config.adapt_to_pi_aloha: +# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) +# batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) +# batch = self.normalize_inputs(batch) +# batch = self.normalize_targets(batch) +# images, img_masks = self.prepare_images(batch) +# state = self.prepare_state(batch) +# lang_tokens, lang_masks = self.prepare_language(batch) +# actions = self.prepare_action(batch) +# actions_is_pad = batch.get("actions_id_pad") +# loss_dict = {} +# losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) +# loss_dict["losses_after_forward"] = losses.clone() + +# if actions_is_pad is not None: +# in_episode_bound = ~actions_is_pad +# losses = losses * in_episode_bound.unsqueeze(-1) +# loss_dict["losses_after_in_ep_bound"] = losses.clone() + +# # Remove padding +# losses = losses[:, :, : self.config.max_action_dim] +# 
loss_dict["losses_after_rm_padding"] = losses.clone() + +# # For backward pass +# loss = losses.mean() +# # For backward pass +# loss_dict["loss"] = loss.item() +# return loss, loss_dict + +# def prepare_images(self, batch): +# """Apply SmolVLA preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and +# convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. +# """ +# images = [] +# img_masks = [] +# present_img_keys = [key for key in self.config.image_features if key in batch] +# missing_img_keys = [key for key in self.config.image_features if key not in batch] + +# if len(present_img_keys) == 0: +# raise ValueError( +# f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" +# ) +# # Preprocess image features present in the batch +# for key in present_img_keys: +# img = batch[key][:, -1, :, :, :] if batch[key].ndim == 5 else batch[key] +# if self.config.resize_imgs_with_padding is not None: +# img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) + +# # Normalize from range [0,1] to [-1,1] as expacted by siglip +# img = img * 2.0 - 1.0 + +# bsize = img.shape[0] +# device = img.device +# if f"{key}_padding_mask" in batch: +# mask = batch[f"{key}_padding_mask"].bool() +# else: +# mask = torch.ones(bsize, dtype=torch.bool, device=device) +# images.append(img) +# img_masks.append(mask) + +# # Create image features not present in the batch +# # as fully 0 padded images. 
+# for num_empty_cameras in range(len(missing_img_keys)): +# if num_empty_cameras >= self.config.empty_cameras: +# break +# img = torch.ones_like(img) * -1 +# mask = torch.zeros_like(mask) +# images.append(img) +# img_masks.append(mask) +# return images, img_masks + +# def prepare_language(self, batch) -> tuple[Tensor, Tensor]: +# """Tokenize the text input""" +# device = batch[OBS_STATE].device +# tasks = batch["task"] +# if isinstance(tasks, str): +# tasks = [tasks] + +# if len(tasks) == 1: +# tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] + +# tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] + +# tokenized_prompt = self.language_tokenizer.__call__( +# tasks, +# padding=self.config.pad_language_to, +# padding_side="right", +# max_length=self.config.tokenizer_max_length, +# return_tensors="pt", +# ) +# lang_tokens = tokenized_prompt["input_ids"].to(device=device) +# lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) + +# return lang_tokens, lang_masks + +# def _pi_aloha_decode_state(self, state): +# # Flip the joints. +# for motor_idx in [1, 2, 8, 9]: +# state[:, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. +# for motor_idx in [6, 13]: +# state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) +# return state + +# def _pi_aloha_encode_actions(self, actions): +# # Flip the joints. +# for motor_idx in [1, 2, 8, 9]: +# actions[:, :, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. +# for motor_idx in [6, 13]: +# actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) +# return actions + +# def _pi_aloha_encode_actions_inv(self, actions): +# # Flip the joints again. +# for motor_idx in [1, 2, 8, 9]: +# actions[:, :, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. 
+# for motor_idx in [6, 13]: +# actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) +# return actions + +# def prepare_state(self, batch): +# """Pad state""" +# state = batch[OBS_STATE][:, -1, :] if batch[OBS_STATE].ndim > 2 else batch[OBS_STATE] +# state = pad_vector(state, self.config.max_state_dim) +# return state + +# def prepare_action(self, batch): +# """Pad action""" +# actions = pad_vector(batch[ACTION], self.config.max_action_dim) +# return actions + + +# def pad_tensor(tensor, max_len, pad_value=0): +# """ +# Efficiently pads a tensor along sequence dimension to match max_len. + +# Args: +# tensor (torch.Tensor): Shape (B, L, ...) or (B, L). +# max_len (int): Fixed sequence length. +# pad_value (int/float): Value for padding. + +# Returns: +# torch.Tensor: Shape (B, max_len, ...) or (B, max_len). +# """ +# b, d = tensor.shape[:2] + +# # Create a padded tensor of max_len and copy the existing values +# padded_tensor = torch.full( +# (b, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device +# ) +# padded_tensor[:, :d] = tensor # Efficient in-place copy + +# return padded_tensor + + +# class VLAFlowMatching(nn.Module): +# """ +# SmolVLA + +# [Paper]() + +# Designed by Hugging Face. 
+# ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +# │ actions │ +# │ ā–² │ +# │ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€|────┐ │ +# │ | │────► │ │ │ +# │ | │ kv │ │ │ +# │ | │────► │Action│ │ +# │ | VLM │cache │Expert│ | +# │ │ │────► | │ │ +# │ │ │ │ │ │ +# │ ā””ā–²ā”€ā”€ā–²ā”€ā”€ā”€ā–²ā”€ā”˜ ā””ā”€ā”€ā”€ā–²ā”€ā”€ā”˜ | +# │ │ | | │ | +# │ | | | noise │ +# │ │ │ state │ +# │ │ language tokens │ +# │ image(s) │ +# ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +# """ + +# def __init__(self, config: SmolVLAConfig): +# super().__init__() +# self.config = config + +# self.vlm_with_expert = SmolVLMWithExpertModel( +# model_id=self.config.vlm_model_name, +# freeze_vision_encoder=self.config.freeze_vision_encoder, +# train_expert_only=self.config.train_expert_only, +# load_vlm_weights=self.config.load_vlm_weights, +# attention_mode=self.config.attention_mode, +# num_expert_layers=self.config.num_expert_layers, +# num_vlm_layers=self.config.num_vlm_layers, +# self_attn_every_n_layers=self.config.self_attn_every_n_layers, +# expert_width_multiplier=self.config.expert_width_multiplier, +# ) +# self.state_proj = nn.Linear( +# self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size +# ) +# self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) +# self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) + +# self.action_time_mlp_in = nn.Linear( +# self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size +# ) +# self.action_time_mlp_out = nn.Linear( +# self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size +# ) + +# self.set_requires_grad() +# self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id +# self.global_image_token = 
self.vlm_with_expert.processor.tokenizer.global_image_token_id +# self.global_image_start_token = torch.tensor( +# [self.fake_image_token, self.global_image_token], dtype=torch.long +# ) + +# self.add_image_special_tokens = self.config.add_image_special_tokens +# self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) +# self.prefix_length = self.config.prefix_length + +# def set_requires_grad(self): +# for params in self.state_proj.parameters(): +# params.requires_grad = self.config.train_state_proj + +# def sample_noise(self, shape, device): +# noise = torch.normal( +# mean=0.0, +# std=1.0, +# size=shape, +# dtype=torch.float32, +# device=device, +# ) +# return noise + +# def sample_time(self, bsize, device): +# beta_dist = torch.distributions.Beta(concentration1=1.5, concentration0=1.0) +# time_beta = beta_dist.sample((bsize,)).to(device=device, dtype=torch.float32) +# time = time_beta * 0.999 + 0.001 +# return time + +# def embed_prefix( +# self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None +# ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +# """Embed images with SigLIP and language tokens with embedding layer to prepare +# for SmolVLM transformer processing. 
+# """ +# embs = [] +# pad_masks = [] +# att_masks = [] +# for _img_idx, ( +# img, +# img_mask, +# ) in enumerate(zip(images, img_masks, strict=False)): +# if self.add_image_special_tokens: +# image_start_token = ( +# self.vlm_with_expert.embed_language_tokens( +# self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device) +# ) +# .unsqueeze(0) +# .expand(img.shape[0], -1, -1) +# ) +# image_start_mask = torch.ones_like( +# image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device +# ) +# att_masks += [0] * (image_start_mask.shape[-1]) +# embs.append(image_start_token) +# pad_masks.append(image_start_mask) + +# img_emb = self.vlm_with_expert.embed_image(img) +# img_emb = img_emb + +# # Normalize image embeddings +# img_emb_dim = img_emb.shape[-1] +# img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) + +# bsize, num_img_embs = img_emb.shape[:2] +# img_mask = img_mask[:, None].expand(bsize, num_img_embs) + +# embs.append(img_emb) +# pad_masks.append(img_mask) + +# att_masks += [0] * (num_img_embs) +# if self.add_image_special_tokens: +# image_end_token = ( +# self.vlm_with_expert.embed_language_tokens( +# self.image_end_token.to(device=self.vlm_with_expert.vlm.device) +# ) +# .unsqueeze(0) +# .expand(img.shape[0], -1, -1) +# ) +# image_end_mask = torch.ones_like( +# image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device +# ) +# embs.append(image_end_token) +# pad_masks.append(image_end_mask) +# att_masks += [0] * (image_end_mask.shape[1]) +# lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) +# # Normalize language embeddings +# lang_emb_dim = lang_emb.shape[-1] +# lang_emb = lang_emb * math.sqrt(lang_emb_dim) + +# embs.append(lang_emb) +# pad_masks.append(lang_masks) + +# num_lang_embs = lang_emb.shape[1] +# att_masks += [0] * num_lang_embs + +# state_emb = self.state_proj(state) +# state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb +# 
embs.append(state_emb) +# bsize = state_emb.shape[0] +# device = state_emb.device + +# states_seq_len = state_emb.shape[1] +# state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) +# pad_masks.append(state_mask) + +# # Set attention masks so that image and language inputs do not attend to state or actions +# att_masks += [1] * (states_seq_len) +# embs = torch.cat(embs, dim=1) +# pad_masks = torch.cat(pad_masks, dim=1) +# att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) +# att_masks = att_masks[None, :] + +# seq_len = pad_masks.shape[1] +# if seq_len < self.prefix_length: +# embs = pad_tensor(embs, self.prefix_length, pad_value=0) +# pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) +# att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) + +# att_masks = att_masks.expand(bsize, -1) + +# return embs, pad_masks, att_masks + +# def embed_suffix(self, noisy_actions, timestep): +# """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" +# embs = [] +# pad_masks = [] +# att_masks = [] + +# # Fuse timestep + action information using an MLP +# action_emb = self.action_in_proj(noisy_actions) +# device = action_emb.device +# bsize = action_emb.shape[0] +# dtype = action_emb.dtype +# # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] +# time_emb = create_sinusoidal_pos_embedding( +# timestep, +# self.vlm_with_expert.expert_hidden_size, +# self.config.min_period, +# self.config.max_period, +# device=device, +# ) +# time_emb = time_emb.type(dtype=dtype) + +# time_emb = time_emb[:, None, :].expand_as(action_emb) +# action_time_emb = torch.cat([action_emb, time_emb], dim=2) + +# action_time_emb = self.action_time_mlp_in(action_time_emb) +# action_time_emb = F.silu(action_time_emb) # swish == silu +# action_time_emb = self.action_time_mlp_out(action_time_emb) + +# # Add to input tokens +# embs.append(action_time_emb) + +# bsize, 
action_time_dim = action_time_emb.shape[:2] +# action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) +# pad_masks.append(action_time_mask) + +# # Set attention masks so that image, language and state inputs do not attend to action tokens +# att_masks += [1] * self.config.chunk_size +# embs = torch.cat(embs, dim=1) +# pad_masks = torch.cat(pad_masks, dim=1) +# att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) +# att_masks = att_masks[None, :].expand(bsize, len(att_masks)) +# # added by jade +# seq_len = pad_masks.shape[1] +# if seq_len < self.config.chunk_size: +# embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) +# pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) +# att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) +# return embs, pad_masks, att_masks + +# def forward( +# self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None +# ) -> Tensor: +# """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" +# #added by jade +# if actions.ndim == 2: +# actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) +# if noise is None: +# noise = self.sample_noise(actions.shape, actions.device) + +# if time is None: +# time = self.sample_time(actions.shape[0], actions.device) + +# time_expanded = time[:, None, None] +# x_t = time_expanded * noise + (1 - time_expanded) * actions +# u_t = noise - actions +# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( +# images, img_masks, lang_tokens, lang_masks, state=state +# ) +# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, time) + +# pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) +# att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) + +# att_2d_masks = make_att_2d_masks(pad_masks, att_masks) +# position_ids = torch.cumsum(pad_masks, dim=1) - 1 +# (_, suffix_out), 
_ = self.vlm_with_expert.forward( +# attention_mask=att_2d_masks, +# position_ids=position_ids, +# past_key_values=None, +# inputs_embeds=[prefix_embs, suffix_embs], +# use_cache=False, +# fill_kv_cache=False, +# ) +# # suffix_out = suffix_out[:, -self.config.chunk_size :] +# suffix_out = suffix_out[:, -self.config.chunk_size:, :] +# # Original openpi code, upcast attention output +# suffix_out = suffix_out.to(dtype=torch.float32) +# v_t = self.action_out_proj(suffix_out) +# losses = F.mse_loss(u_t, v_t, reduction="none") +# return losses + +# def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: +# """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" +# bsize = state.shape[0] +# device = state.device + +# if noise is None: +# actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) +# noise = self.sample_noise(actions_shape, device) + +# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( +# images, img_masks, lang_tokens, lang_masks, state=state +# ) +# prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) +# prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 +# # Compute image and language key value cache +# _, past_key_values = self.vlm_with_expert.forward( +# attention_mask=prefix_att_2d_masks, +# position_ids=prefix_position_ids, +# past_key_values=None, +# inputs_embeds=[prefix_embs, None], +# use_cache=self.config.use_cache, +# fill_kv_cache=True, +# ) +# dt = -1.0 / self.config.num_steps +# dt = torch.tensor(dt, dtype=torch.float32, device=device) + +# x_t = noise +# time = torch.tensor(1.0, dtype=torch.float32, device=device) +# while time >= -dt / 2: +# expanded_time = time.expand(bsize) +# v_t = self.denoise_step( +# prefix_pad_masks, +# past_key_values, +# x_t, +# expanded_time, +# ) +# # Euler step +# x_t += dt * v_t +# time += dt +# return x_t + +# def denoise_step( +# self, +# prefix_pad_masks, 
+# past_key_values, +# x_t, +# timestep, +# ): +# """Apply one denoising step of the noise `x_t` at a given timestep.""" +# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, timestep) + +# suffix_len = suffix_pad_masks.shape[1] +# batch_size = prefix_pad_masks.shape[0] +# prefix_len = prefix_pad_masks.shape[1] +# prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) + +# suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) + +# full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) +# prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] +# position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 + +# outputs_embeds, _ = self.vlm_with_expert.forward( +# attention_mask=full_att_2d_masks, +# position_ids=position_ids, +# past_key_values=past_key_values, +# inputs_embeds=[None, suffix_embs], +# use_cache=self.config.use_cache, +# fill_kv_cache=False, +# ) +# suffix_out = outputs_embeds[1] +# suffix_out = suffix_out[:, -self.config.chunk_size :] +# suffix_out = suffix_out.to(dtype=torch.float32) +# v_t = self.action_out_proj(suffix_out) +# return v_t #!/usr/bin/env python # Copyright 2025 HuggingFace Inc. team. All rights reserved. 
@@ -63,7 +1015,7 @@ import torch.nn.functional as F # noqa: N812 from torch import Tensor, nn from transformers import AutoProcessor -from lerobot.constants import ACTION +from lerobot.constants import ACTION, OBS_STATE from lerobot.policies.normalize import ( Normalize, Unnormalize, @@ -75,8 +1027,7 @@ from lerobot.policies.utils import ( populate_queues, ) from lerobot.utils.utils import get_safe_dtype -OBS_STATE = 'state' -ACTION = 'actions' + # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") @@ -825,21 +1776,12 @@ class VLAFlowMatching(nn.Module): pad_masks = torch.cat(pad_masks, dim=1) att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) att_masks = att_masks[None, :].expand(bsize, len(att_masks)) - # added by jade - seq_len = pad_masks.shape[1] - if seq_len < self.config.chunk_size: - embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) - pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) - att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) return embs, pad_masks, att_masks def forward( self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None ) -> Tensor: """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" - #added by jade - if actions.ndim == 2: - actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) if noise is None: noise = self.sample_noise(actions.shape, actions.device) @@ -867,8 +1809,7 @@ class VLAFlowMatching(nn.Module): use_cache=False, fill_kv_cache=False, ) - # suffix_out = suffix_out[:, -self.config.chunk_size :] - suffix_out = suffix_out[:, -self.config.chunk_size:, :] + suffix_out = suffix_out[:, -self.config.chunk_size :] # Original openpi code, upcast attention output suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) @@ -949,4 +1890,4 @@ class 
VLAFlowMatching(nn.Module): suffix_out = suffix_out[:, -self.config.chunk_size :] suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) - return v_t + return v_t \ No newline at end of file diff --git a/src/lerobot/policies/smolvla/modeling_smolvla_v2.py b/src/lerobot/policies/smolvla/modeling_smolvla_v2.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt new file mode 100644 index 000000000..3410062ba --- /dev/null +++ b/src/lerobot/policies/smolvla/saver.txt @@ -0,0 +1 @@ +c \ No newline at end of file diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py index f6a49dccf..e4cd7acac 100644 --- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py @@ -77,8 +77,8 @@ class SmolVLMWithExpertModel(nn.Module): self.vlm = AutoModelForImageTextToText.from_pretrained( model_id, device_map="auto", - # torch_dtype="bfloat16", - torch_dtype=torch.float16, + torch_dtype="bfloat16", + # torch_dtype=torch.float16, low_cpu_mem_usage=True, ) config = self.vlm.config @@ -547,4 +547,4 @@ class SmolVLMWithExpertModel(nn.Module): # we use -1 because sequence length can change att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) - return att_output + return att_output \ No newline at end of file diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 3145bed35..92a3bf833 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -458,6 +458,43 @@ def _compile_episode_data( data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1) return data_dict +from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy +from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata +def _inject_normalization_stats(policy: 
SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): + """Recreate normalization layers with proper stats from the dataset.""" + from lerobot.policies.normalize import Normalize, Unnormalize + + # Convert numpy stats to the format expected by normalization layers + stats = {} + for key, stat_dict in dataset_meta.stats.items(): + stats[key] = { + stat_type: torch.from_numpy(stat_array) if isinstance(stat_array, np.ndarray) else stat_array + for stat_type, stat_array in stat_dict.items() + } + + # Recreate normalization layers with proper stats + normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) + + normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) + + unnormalize_outputs = Unnormalize( + policy.config.output_features, policy.config.normalization_mapping, stats + ) + + # Replace the normalization layers on the policy + policy.normalize_inputs = normalize_inputs + policy.normalize_targets = normalize_targets + policy.unnormalize_outputs = unnormalize_outputs + + print("Normalization layers recreated with dataset stats.") + + +def load_smolvla(cfg, dataset_repo: str): + from lerobot.datasets.lerobot_dataset import LeRobotDataset + dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/') + policy = make_policy(cfg=cfg, ds_meta=dataset.meta) + _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta) # only needed if stats are missing + return policy, dataset @parser.wrap() @@ -466,7 +503,9 @@ def eval_main(cfg: EvalPipelineConfig): # Check device is available device = get_safe_torch_device(cfg.policy.device, log=True) - + #login to hf + from huggingface_hub import login + login() torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True set_seed(cfg.seed) @@ -481,6 +520,9 @@ def eval_main(cfg: EvalPipelineConfig): cfg=cfg.policy, env_cfg=cfg.env, ) + # breakpoint() + load_smolvla(cfg.policy, 
"physical-intelligence/libero") + # breakpoint() policy.eval() with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): if cfg.env.multitask_eval: diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index d5d562518..74219fc38 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -104,6 +104,32 @@ def update_policy( train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict +# def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): +# """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" +# from lerobot.policies.normalize import Normalize, Unnormalize + +# if not hasattr(dataset_meta, "stats") or not dataset_meta.stats: +# print("āš ļø Dataset has no stats, skipping normalization injection.") +# return + +# stats = {} +# for key, stat_dict in dataset_meta.stats.items(): +# stats[key] = { +# stat_type: torch.as_tensor(stat_array) +# if isinstance(stat_array, np.ndarray) +# else stat_array +# for stat_type, stat_array in stat_dict.items() +# } + +# normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) +# normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) +# unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats) + +# policy.normalize_inputs = normalize_inputs +# policy.normalize_targets = normalize_targets +# policy.unnormalize_outputs = unnormalize_outputs + +# print("āœ… Normalization layers injected with dataset stats.") @parser.wrap() def train(cfg: TrainPipelineConfig): @@ -126,7 +152,6 @@ def train(cfg: TrainPipelineConfig): logging.info("Creating dataset") dataset = make_dataset(cfg) - # Create environment used for evaluating checkpoints during training on simulation data. 
# On real-world data, no need to create an environment as evaluations are done outside train.py, # using the eval.py instead, with gym_dora environment and dora-rs. @@ -140,7 +165,6 @@ def train(cfg: TrainPipelineConfig): cfg=cfg.policy, ds_meta=dataset.meta, ) - logging.info("Creating optimizer and scheduler") optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) @@ -203,7 +227,6 @@ def train(cfg: TrainPipelineConfig): start_time = time.perf_counter() batch = next(dl_iter) train_tracker.dataloading_s = time.perf_counter() - start_time - for key in batch: if isinstance(batch[key], torch.Tensor): batch[key] = batch[key].to(device, non_blocking=device.type == "cuda") diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py new file mode 100644 index 000000000..26a9e7aea --- /dev/null +++ b/src/lerobot/scripts/train_2.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import time +from contextlib import nullcontext +from pprint import pformat +from typing import Any + +import torch +from termcolor import colored +from torch.amp import GradScaler +from torch.optim import Optimizer + +from lerobot.configs import parser +from lerobot.configs.train import TrainPipelineConfig +from lerobot.datasets.factory import make_dataset +from lerobot.datasets.sampler import EpisodeAwareSampler +from lerobot.datasets.utils import cycle +from lerobot.envs.factory import make_env +from lerobot.optim.factory import make_optimizer_and_scheduler +from lerobot.policies.factory import make_policy +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.policies.utils import get_device_from_parameters +from lerobot.scripts.eval import eval_policy, eval_policy_multitask +from lerobot.utils.logging_utils import AverageMeter, MetricsTracker +from lerobot.utils.random_utils import set_seed +from lerobot.utils.train_utils import ( + get_step_checkpoint_dir, + get_step_identifier, + load_training_state, + save_checkpoint, + update_last_checkpoint, +) +from lerobot.utils.utils import ( + format_big_number, + get_safe_torch_device, + has_method, + init_logging, +) +from lerobot.utils.wandb_utils import WandBLogger + + +def update_policy( + train_metrics: MetricsTracker, + policy: PreTrainedPolicy, + batch: Any, + optimizer: Optimizer, + grad_clip_norm: float, + grad_scaler: GradScaler, + lr_scheduler=None, + use_amp: bool = False, + lock=None, +) -> tuple[MetricsTracker, dict]: + start_time = time.perf_counter() + device = get_device_from_parameters(policy) + policy.train() + with torch.autocast(device_type=device.type) if use_amp else nullcontext(): + loss, output_dict = policy.forward(batch) + # TODO(rcadene): policy.unnormalize_outputs(out_dict) + grad_scaler.scale(loss).backward() + + # Unscale the gradient of the optimizer's assigned params in-place **prior to gradient clipping**. 
+    grad_scaler.unscale_(optimizer)
+
+    grad_norm = torch.nn.utils.clip_grad_norm_(
+        policy.parameters(),
+        grad_clip_norm,
+        error_if_nonfinite=False,
+    )
+
+    # Optimizer's gradients are already unscaled, so scaler.step does not unscale them,
+    # although it still skips optimizer.step() if the gradients contain infs or NaNs.
+    with lock if lock is not None else nullcontext():
+        grad_scaler.step(optimizer)
+        # Updates the scale for next iteration.
+        grad_scaler.update()
+
+    optimizer.zero_grad()
+
+    # Step through pytorch scheduler at every batch instead of epoch
+    if lr_scheduler is not None:
+        lr_scheduler.step()
+
+    if has_method(policy, "update"):
+        # To possibly update an internal buffer (for instance an Exponential Moving Average like in TDMPC).
+        policy.update()
+
+    train_metrics.loss = loss.item()
+    train_metrics.grad_norm = grad_norm.item()
+    train_metrics.lr = optimizer.param_groups[0]["lr"]
+    train_metrics.update_s = time.perf_counter() - start_time
+    return train_metrics, output_dict
+
+def _inject_normalization_stats(policy: "SmolVLAPolicy", dataset_meta: "LeRobotDatasetMetadata"):  # quoted: names not imported here
+    """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
+    from lerobot.policies.normalize import Normalize, Unnormalize
+
+    if not hasattr(dataset_meta, "stats") or not dataset_meta.stats:
+        print("āš ļø Dataset has no stats, skipping normalization injection.")
+        return
+
+    stats = {}
+    for key, stat_dict in dataset_meta.stats.items():
+        stats[key] = {
+            stat_type: torch.as_tensor(stat_array)
+            if not isinstance(stat_array, torch.Tensor)
+            else stat_array
+            for stat_type, stat_array in stat_dict.items()
+        }
+
+    normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
+    normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+    unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+
+    policy.normalize_inputs = normalize_inputs
+    policy.normalize_targets = normalize_targets
+    policy.unnormalize_outputs = unnormalize_outputs
+
+    print("āœ… Normalization layers injected with dataset stats.")
+
+@parser.wrap()
+def train(cfg: TrainPipelineConfig):
+    cfg.validate()
+    logging.info(pformat(cfg.to_dict()))
+
+    if cfg.wandb.enable and cfg.wandb.project:
+        wandb_logger = WandBLogger(cfg)
+    else:
+        wandb_logger = None
+        logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
+
+    if cfg.seed is not None:
+        set_seed(cfg.seed)
+
+    # Check device is available
+    device = get_safe_torch_device(cfg.policy.device, log=True)
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+
+    logging.info("Creating dataset")
+    dataset = make_dataset(cfg)
+
+    # Create environment used for evaluating checkpoints during training on simulation data.
+    # On real-world data, no need to create an environment as evaluations are done outside train.py,
+    # using the eval.py instead, with gym_dora environment and dora-rs.
+ eval_env = None + if cfg.eval_freq > 0 and cfg.env is not None: + logging.info("Creating env") + eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + + logging.info("Creating policy") + policy = make_policy( + cfg=cfg.policy, + ds_meta=dataset.meta, + ) + logging.info("Creating optimizer and scheduler") + optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) + grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) + + step = 0 # number of policy updates (forward + backward + optim) + + if cfg.resume: + step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) + + num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) + num_total_params = sum(p.numel() for p in policy.parameters()) + + logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") + if cfg.env is not None: + logging.info(f"{cfg.env.task=}") + logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") + logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") + logging.info(f"{dataset.num_episodes=}") + logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") + logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") + + # create dataloader for offline training + if hasattr(cfg.policy, "drop_n_last_frames"): + shuffle = False + sampler = EpisodeAwareSampler( + dataset.episode_data_index, + drop_n_last_frames=cfg.policy.drop_n_last_frames, + shuffle=True, + ) + else: + shuffle = True + sampler = None + + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.num_workers, + batch_size=cfg.batch_size, + shuffle=shuffle, + sampler=sampler, + pin_memory=device.type == "cuda", + drop_last=False, + ) + dl_iter = cycle(dataloader) + + policy.train() + train_metrics = { + "loss": AverageMeter("loss", ":.3f"), + "grad_norm": AverageMeter("grdn", ":.3f"), + 
"lr": AverageMeter("lr", ":0.1e"), + "update_s": AverageMeter("updt_s", ":.3f"), + "dataloading_s": AverageMeter("data_s", ":.3f"), + } + + train_tracker = MetricsTracker( + cfg.batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, initial_step=step + ) + + logging.info("Start offline training on a fixed dataset") + for _ in range(step, cfg.steps): + start_time = time.perf_counter() + batch = next(dl_iter) + train_tracker.dataloading_s = time.perf_counter() - start_time + + for key in batch: + if isinstance(batch[key], torch.Tensor): + batch[key] = batch[key].to(device, non_blocking=device.type == "cuda") + + train_tracker, output_dict = update_policy( + train_tracker, + policy, + batch, + optimizer, + cfg.optimizer.grad_clip_norm, + grad_scaler=grad_scaler, + lr_scheduler=lr_scheduler, + use_amp=cfg.policy.use_amp, + ) + + # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we + # increment `step` here. + step += 1 + train_tracker.step() + is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 + is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps + is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 + + if is_log_step: + logging.info(train_tracker) + if wandb_logger: + wandb_log_dict = train_tracker.to_dict() + if output_dict: + wandb_log_dict.update(output_dict) + wandb_logger.log_dict(wandb_log_dict, step) + train_tracker.reset_averages() + + if cfg.save_checkpoint and is_saving_step: + logging.info(f"Checkpoint policy after step {step}") + checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) + save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler) + update_last_checkpoint(checkpoint_dir) + if wandb_logger: + wandb_logger.log_policy(checkpoint_dir) + + if cfg.env and is_eval_step: + step_id = get_step_identifier(step, cfg.steps) + logging.info(f"Eval policy at step {step}") + with ( + torch.no_grad(), + 
torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(), + ): + if cfg.env.multitask_eval: + eval_info = eval_policy_multitask( + eval_env, + policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + max_parallel_tasks=cfg.env.max_parallel_tasks, + ) + aggregated = eval_info["overall"]["aggregated"] + # Print per-suite stats, log? + for task_group, task_group_info in eval_info.items(): + if task_group == "overall": + continue # Skip the overall stats since we already printed it + print(f"\nAggregated Metrics for {task_group}:") + print(task_group_info["aggregated"]) + else: + eval_info = eval_policy( + eval_env, + policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + ) + aggregated = eval_info["aggregated"] + + eval_metrics = { + "avg_sum_reward": AverageMeter("āˆ‘rwrd", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), + } + eval_tracker = MetricsTracker( + cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step + ) + eval_tracker.eval_s = aggregated.pop("eval_s") + eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward") + eval_tracker.pc_success = aggregated.pop("pc_success") + logging.info(eval_tracker) + if wandb_logger: + wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} + wandb_logger.log_dict(wandb_log_dict, step, mode="eval") + wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval") + + if eval_env: + if cfg.env.multitask_eval: + for _task_group, envs_dict in eval_env.items(): + for _idx, env in envs_dict.items(): + env.close() + else: + eval_env.close() + logging.info("End of training") + + if cfg.policy.push_to_hub: + policy.push_model_to_hub(cfg) + + +def main(): + init_logging() + train() + + +if __name__ == "__main__": + main() diff 
--git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py new file mode 100644 index 000000000..e205f138f --- /dev/null +++ b/src/lerobot/scripts/train_accelerate.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import time +from pprint import pformat +from typing import Any + +import torch +from accelerate import Accelerator +from accelerate.utils import set_seed as accelerate_set_seed +from termcolor import colored +from torch.optim import Optimizer + +from lerobot.datasets.factory import make_dataset +from lerobot.datasets.sampler import EpisodeAwareSampler +from lerobot.envs.factory import make_env +from lerobot.optim.factory import make_optimizer_and_scheduler +from lerobot.policies.factory import make_policy +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.utils.logging_utils import AverageMeter, MetricsTracker +from lerobot.utils.train_utils import ( + get_step_checkpoint_dir, + get_step_identifier, + load_training_state, + save_checkpoint, + update_last_checkpoint, +) +from lerobot.utils.utils import ( + format_big_number, + has_method, + init_logging, +) +from lerobot.configs import parser +from lerobot.configs.train import TrainPipelineConfig +from lerobot.scripts.eval import eval_policy + + +def update_policy( + train_metrics: MetricsTracker, + policy: PreTrainedPolicy, + batch: 
Any, + optimizer: Optimizer, + grad_clip_norm: float, + accelerator: Accelerator, + lr_scheduler=None, +) -> tuple[MetricsTracker, dict]: + start_time = time.perf_counter() + policy.train() + + # Use accelerator's autocast context if mixed precision is enabled + with accelerator.autocast(): + loss, output_dict = policy.forward(batch) + # TODO(rcadene): policy.unnormalize_outputs(out_dict) + + # Use accelerator for backward pass + accelerator.backward(loss) + + # Gradient clipping - accelerator handles unscaling automatically + if accelerator.sync_gradients and grad_clip_norm > 0: + grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm) + else: + grad_norm = torch.tensor(0.0) + + optimizer.step() + lr_scheduler.step() if lr_scheduler is not None else None + optimizer.zero_grad() + + # Update policy-specific buffers if needed + if has_method(policy, "update"): + policy.update() + + # Gather metrics across all processes + loss_value = accelerator.gather(loss.detach()).mean().item() + grad_norm_value = accelerator.gather(grad_norm).mean().item() + + train_metrics.loss = loss_value + train_metrics.grad_norm = grad_norm_value + train_metrics.lr = optimizer.param_groups[0]["lr"] + train_metrics.update_s = time.perf_counter() - start_time + return train_metrics, output_dict + + +@parser.wrap() +def train(cfg: TrainPipelineConfig): + cfg.validate() + logging.info(pformat(cfg.to_dict())) + + # Initialize accelerator + from accelerate.utils import DistributedDataParallelKwargs + # added by jade 2 lines + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False) + accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs]) + + from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem + + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator( + mixed_precision="bf16" if cfg.policy.use_amp else "no", + gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps, 
+ log_with="wandb" if cfg.wandb.enable else None, + kwargs_handlers=[ddp_kwargs], + project_dir=cfg.output_dir, + ) + + accelerator.init_trackers( + project_name=cfg.wandb.project, + init_kwargs={ + "wandb": { + "entity": cfg.wandb.entity, + "name": cfg.job_name, + "notes": cfg.wandb.notes, + "tags": cfg_to_group(cfg, return_list=True), + "dir": cfg.output_dir, + "config": cfg.to_dict(), + "save_code": False, + "job_type": "train_eval", + "mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online", + "resume": "must" if cfg.resume else None, + "id": cfg.wandb.run_id + if cfg.wandb.run_id + else (get_wandb_run_id_from_filesystem(cfg.output_dir) if cfg.resume else None), + } + }, + ) + + # Set seed for reproducibility + if cfg.seed is not None: + accelerate_set_seed(cfg.seed) + + # Setup device - accelerator handles device placement + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = True + + # Create dataset + if accelerator.is_main_process: + logging.info("Creating dataset") + dataset = make_dataset(cfg) + print("c") + # Create evaluation environment (only on main process) + eval_env = None + if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process: + logging.info("Creating env") + eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + + # Create policy + if accelerator.is_main_process: + logging.info("Creating policy") + + # Use accelerator's device instead of cfg.policy.device + with accelerator.main_process_first(): + policy = make_policy( + cfg=cfg.policy, + ds_meta=dataset.meta, + ) + + # Create optimizer and scheduler + if accelerator.is_main_process: + logging.info("Creating optimizer and scheduler") + optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) + + step = 0 # number of policy updates + + if cfg.resume: + step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) + + # 
Prepare dataloader + if hasattr(cfg.policy, "drop_n_last_frames"): + shuffle = False + sampler = EpisodeAwareSampler( + dataset.episode_data_index, + drop_n_last_frames=cfg.policy.drop_n_last_frames, + shuffle=True, + ) + else: + shuffle = True + sampler = None + + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.num_workers, + batch_size=cfg.batch_size, + shuffle=shuffle, + sampler=sampler, + pin_memory=True, + drop_last=True, # Important for distributed training + ) + + # Prepare for distributed training + policy, optimizer, dataloader, lr_scheduler = accelerator.prepare( + policy, optimizer, dataloader, lr_scheduler + ) + + # Log training info (only on main process) + if accelerator.is_main_process: + num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) + num_total_params = sum(p.numel() for p in policy.parameters()) + + logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") + if cfg.env is not None: + logging.info(f"{cfg.env.task=}") + logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") + logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") + logging.info(f"{dataset.num_episodes=}") + logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") + logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") + logging.info(f"Number of processes: {accelerator.num_processes}") + logging.info(f"Device: {accelerator.device}") + logging.info(f"Mixed precision: {accelerator.mixed_precision}") + + # Create metrics trackers + train_metrics = { + "loss": AverageMeter("loss", ":.3f"), + "grad_norm": AverageMeter("grdn", ":.3f"), + "lr": AverageMeter("lr", ":0.1e"), + "update_s": AverageMeter("updt_s", ":.3f"), + "dataloading_s": AverageMeter("data_s", ":.3f"), + } + + train_tracker = MetricsTracker( + cfg.batch_size * accelerator.num_processes, # Account for all processes + dataset.num_frames, + 
dataset.num_episodes, + train_metrics, + initial_step=step, + ) + + # Training loop + policy.train() + if accelerator.is_main_process: + logging.info("Start offline training on a fixed dataset") + + # Create iterator from dataloader + dl_iter = iter(dataloader) + + for current_step in range(step, cfg.steps): + start_time = time.perf_counter() + # Get next batch, cycling through dataloader if needed + try: + batch = next(dl_iter) + print("data laoder batch keys: ", batch.keys()) + breakpoint() + except StopIteration: + dl_iter = iter(dataloader) + batch = next(dl_iter) + train_tracker.dataloading_s = time.perf_counter() - start_time + # Update policy + train_tracker, output_dict = update_policy( + train_tracker, + policy, + batch, + optimizer, + cfg.optimizer.grad_clip_norm, + accelerator, + lr_scheduler=lr_scheduler, + ) + + # Increment step counter + step += 1 + train_tracker.step() + + # Determine if we should log, save, or evaluate + is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 + is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps + is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 + + # Logging (only on main process) + if is_log_step and accelerator.is_main_process: + logging.info(train_tracker) + wandb_log_dict = train_tracker.to_dict() + if output_dict: + wandb_log_dict.update(output_dict) + for k, v in wandb_log_dict.items(): + accelerator.log({f"{'train'}/{k}": v}, step=step) + train_tracker.reset_averages() + + # Checkpointing (only on main process) + if cfg.save_checkpoint and is_saving_step: + # āœ… all processes wait here + accelerator.wait_for_everyone() + + if accelerator.is_main_process: + logging.info(f"Checkpoint policy after step {step}") + checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) + + unwrapped_policy = accelerator.unwrap_model(policy) + save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler) + update_last_checkpoint(checkpoint_dir) + + # āœ… all 
processes sync again after saving + accelerator.wait_for_everyone() + + # if wandb_logger: + # wandb_logger.log_policy(checkpoint_dir) + + # Evaluation (only on main process) + if cfg.env and is_eval_step and accelerator.is_main_process: + step_id = get_step_identifier(step, cfg.steps) + logging.info(f"Eval policy at step {step}") + + # Unwrap model for evaluation + unwrapped_policy = accelerator.unwrap_model(policy) + unwrapped_policy.eval() + + with torch.no_grad(): + eval_info = eval_policy( + eval_env, + unwrapped_policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + ) + + eval_metrics = { + "avg_sum_reward": AverageMeter("āˆ‘rwrd", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), + } + eval_tracker = MetricsTracker( + cfg.batch_size * accelerator.num_processes, + dataset.num_frames, + dataset.num_episodes, + eval_metrics, + initial_step=step, + ) + eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s") + eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward") + eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success") + logging.info(eval_tracker) + + wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} + for k, v in wandb_log_dict.items(): + accelerator.log({f"{'eval'}/{k}": v}, step=step) + + # Set back to training mode + policy.train() + + # Wait for all processes to finish + accelerator.wait_for_everyone() + + # Cleanup + if eval_env and accelerator.is_main_process: + eval_env.close() + + if accelerator.is_main_process: + logging.info("End of training") + accelerator.end_training() # added by jade + + +if __name__ == "__main__": + init_logging() + train() diff --git a/tmux_log.txt b/tmux_log.txt new file mode 100644 index 000000000..4936578bc --- /dev/null +++ b/tmux_log.txt @@ -0,0 +1,2008 @@ + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': 
{'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-08 13:23:15 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:23:15 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:23:15 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 
'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 
2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +INFO 2025-09-08 13:23:15 celerate.py:149 Creating dataset +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 35414.48it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 5660.00it/s] +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 43760.67it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 5629.72it/s] +c +INFO 2025-09-08 13:23:22 celerate.py:160 Creating policy +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. 
+ warnings.warn( +c +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank1]:[W908 13:23:22.785516795 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +Reducing the number of VLM layers to 16 ... +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank0]:[W908 13:23:43.028071493 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +INFO 2025-09-08 13:23:43 celerate.py:171 Creating optimizer and scheduler +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-08 13:24:04 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla +_lr1e-4bs32steps100000 +INFO 2025-09-08 13:24:04 celerate.py:213 cfg.env.task='libero_spatial' +INFO 2025-09-08 13:24:04 celerate.py:214 cfg.steps=100000 (100K) +INFO 2025-09-08 13:24:04 celerate.py:215 dataset.num_frames=273465 (273K) +INFO 2025-09-08 13:24:04 celerate.py:216 dataset.num_episodes=1693 +INFO 2025-09-08 13:24:04 celerate.py:217 num_learnable_params=99880992 (100M) +INFO 2025-09-08 13:24:04 celerate.py:218 num_total_params=450046220 (450M) +INFO 2025-09-08 13:24:04 celerate.py:219 Number of processes: 2 +INFO 2025-09-08 13:24:04 celerate.py:220 Device: cuda:0 +INFO 2025-09-08 13:24:04 celerate.py:221 Mixed precision: bf16 +INFO 2025-09-08 13:24:04 celerate.py:243 Start offline training on a fixed dataset + +bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp +', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima +ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) +> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() +-> train_tracker, output_dict = update_policy( +(Pdb) +bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp +', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima +ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) +> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() +-> train_tracker, output_dict = update_policy( +(Pdb) batch.keys()[rank0]:[W908 13:24:43.868440913 reducer.cpp:1430] Warning: find_unused_parameters=True was specif +ied in DDP constructor, but did not find any unused parameters in the forward pass. 
This flag results in an extra tr +aversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never h +as any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false +positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +policy.config.input_features +*** SyntaxError: invalid syntax +(Pdb) policy.config.input_features +*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' +(Pdb) policy +DistributedDataParallel( + (module): SmolVLAPolicy( + (normalize_inputs): Normalize( + (buffer_observation_state): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] + ) + ) + (normalize_targets): Normalize( + (buffer_action): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + ) + ) + (unnormalize_outputs): Unnormalize( + (buffer_action): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + ) + ) + (model): VLAFlowMatching( + (vlm_with_expert): SmolVLMWithExpertModel( + (vlm): SmolVLMForConditionalGeneration( + (model): SmolVLMModel( + (vision_model): SmolVLMVisionTransformer( + (embeddings): SmolVLMVisionEmbeddings( + (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid) + (position_embedding): Embedding(1024, 768) + ) + (encoder): SmolVLMEncoder( + (layers): ModuleList( + (0-11): 12 x SmolVLMEncoderLayer( + (self_attn): SmolVLMVisionAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, 
bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): SmolVLMVisionMLP( + (activation_fn): PytorchGELUTanh() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + ) + ) + ) + (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + ) + (connector): SmolVLMConnector( + (modality_projection): SmolVLMSimpleMLP( + (proj): Linear(in_features=12288, out_features=960, bias=False) + ) + ) + (text_model): LlamaModel( + (embed_tokens): Embedding(49280, 960, padding_idx=2) + (layers): ModuleList( + (0-15): 16 x LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=960, out_features=960, bias=False) + (k_proj): Linear(in_features=960, out_features=320, bias=False) + (v_proj): Linear(in_features=960, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=960, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=960, out_features=2560, bias=False) + (up_proj): Linear(in_features=960, out_features=2560, bias=False) + (down_proj): Linear(in_features=2560, out_features=960, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((960,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((960,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + ) + (lm_head): Linear(in_features=960, out_features=49280, bias=False) + ) + (lm_expert): LlamaModel( + (embed_tokens): None + (layers): ModuleList( + (0): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, 
out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (1): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (2): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (3): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): 
Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (4): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (5): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (6): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, 
bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (7): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (8): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (9): LlamaDecoderLayer( + 
(self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (10): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (11): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + 
(post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (12): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (13): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (14): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, 
bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (15): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((720,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + ) + (state_proj): Linear(in_features=32, out_features=960, bias=True) + (action_in_proj): Linear(in_features=32, out_features=720, bias=True) + (action_out_proj): Linear(in_features=720, out_features=32, bias=True) + (action_time_mlp_in): Linear(in_features=1440, out_features=720, bias=True) + (action_time_mlp_out): Linear(in_features=720, out_features=720, bias=True) + ) + ) +) +(Pdb) policy.config +*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' +(Pdb) policy.input_features +*** AttributeError: 'DistributedDataParallel' object has no attribute 'input_features' +(Pdb) quit() +[rank1]: Traceback (most recent call last): +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 368, in +[rank1]: train() +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner +[rank1]: response = fn(cfg, *args, **kwargs) +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train +[rank1]: train_tracker, 
output_dict = update_policy( +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train +[rank1]: train_tracker, output_dict = update_policy( +[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch +[rank1]: return self.dispatch_line(frame) +[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line +[rank1]: if self.quitting: raise BdbQuit +[rank1]: bdb.BdbQuit +W0908 13:25:34.274000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 776 +663 closing signal SIGTERM +E0908 13:25:34.589000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1 +) local_rank: 1 (pid: 776664) of binary: /home/jade_choghari/miniconda3/envs/lerobot/bin/python +Traceback (most recent call last): + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1245, in + main() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1241, in main + launch_command(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1226, in launch_command + multi_gpu_launcher(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 853, in multi_gpu_launcher + distrib_run.run(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/run.py", line 883 +, in run + elastic_launch( + File 
"/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", + line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", + line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +src/lerobot/scripts/train_accelerate.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-09-08_13:25:34 + host : hf-dgx-01 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 776664) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/7_train_acc.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000 +The following values were not passed to `accelerate launch` and had defaults used instead: + More than one GPU was found, enabling multi-GPU training. + If this was unintended please pass in `--num_processes=1`. + `--dynamo_backend` was set to a value of `'no'` +To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`. +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/utils/launch.py:238: UserWarning +: Port `29522` is already in use. Accelerate will attempt to launch in a standalone-like mode by finding an open por +t automatically for this session. 
If this current attempt fails, or for more control in future runs, please specify +a different port (e.g., `--main_process_port `) or use `--main_process_port 0` for automatic selec +tion in your launch command or Accelerate config file. + warnings.warn( +INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 
'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 
'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', 
+ 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 
'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +INFO 2025-09-08 13:33:47 celerate.py:149 Creating dataset +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 103295.66it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 5229.81it/s] +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 360601.09it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 4881.54it/s] +c +INFO 2025-09-08 13:33:53 celerate.py:160 Creating policy +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +c +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. 
Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank1]:[W908 13:33:54.613597516 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +Reducing the number of VLM layers to 16 ... +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank0]:[W908 13:34:15.806448425 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +INFO 2025-09-08 13:34:15 celerate.py:171 Creating optimizer and scheduler +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-08 13:34:36 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla +_lr1e-4bs32steps100000 +INFO 2025-09-08 13:34:36 celerate.py:213 cfg.env.task='libero_spatial' +INFO 2025-09-08 13:34:36 celerate.py:214 cfg.steps=100000 (100K) +INFO 2025-09-08 13:34:36 celerate.py:215 dataset.num_frames=273465 (273K) +INFO 2025-09-08 13:34:36 celerate.py:216 dataset.num_episodes=1693 +INFO 2025-09-08 13:34:36 celerate.py:217 num_learnable_params=99880992 (100M) +INFO 2025-09-08 13:34:36 celerate.py:218 num_total_params=450046220 (450M) +INFO 2025-09-08 13:34:36 celerate.py:219 Number of processes: 2 +INFO 2025-09-08 13:34:36 celerate.py:220 Device: cuda:0 +INFO 2025-09-08 13:34:36 celerate.py:221 Mixed precision: bf16 +INFO 2025-09-08 13:34:36 celerate.py:243 Start offline training on a fixed dataset +[rank1]:[W908 13:34:39.454560620 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru +ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut +ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para +meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m +odel has flow control causing later iterations to have unused parameters. (function operator()) +[rank0]:[W908 13:34:40.502702504 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru +ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut +ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para +meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m +odel has flow control causing later iterations to have unused parameters. 
(function operator()) +INFO 2025-09-08 13:36:23 celerate.py:281 step:200 smpl:13K ep:79 epch:0.05 loss:0.963 grdn:2.699 lr:2.0e-05 updt_s:0 +.506 data_s:0.027 +INFO 2025-09-08 13:38:09 celerate.py:281 step:400 smpl:26K ep:158 epch:0.09 loss:0.389 grdn:3.127 lr:6.0e-05 updt_s: +0.525 data_s:0.003 +INFO 2025-09-08 13:39:53 celerate.py:281 step:600 smpl:38K ep:238 epch:0.14 loss:0.261 grdn:2.618 lr:9.5e-05 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 13:41:37 celerate.py:281 step:800 smpl:51K ep:317 epch:0.19 loss:0.231 grdn:1.684 lr:9.9e-05 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 13:43:21 celerate.py:281 step:1K smpl:64K ep:396 epch:0.23 loss:0.211 grdn:1.258 lr:9.9e-05 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 13:45:05 celerate.py:281 step:1K smpl:77K ep:475 epch:0.28 loss:0.198 grdn:1.032 lr:9.9e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 13:46:49 celerate.py:281 step:1K smpl:90K ep:555 epch:0.33 loss:0.182 grdn:0.880 lr:9.8e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 13:48:33 celerate.py:281 step:2K smpl:102K ep:634 epch:0.37 loss:0.167 grdn:0.744 lr:9.8e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:50:17 celerate.py:281 step:2K smpl:115K ep:713 epch:0.42 loss:0.157 grdn:0.680 lr:9.7e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:52:01 celerate.py:281 step:2K smpl:128K ep:792 epch:0.47 loss:0.147 grdn:0.612 lr:9.6e-05 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 13:53:44 celerate.py:281 step:2K smpl:141K ep:872 epch:0.51 loss:0.142 grdn:0.576 lr:9.5e-05 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 13:55:27 celerate.py:281 step:2K smpl:154K ep:951 epch:0.56 loss:0.136 grdn:0.523 lr:9.4e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:57:11 celerate.py:281 step:3K smpl:166K ep:1K epch:0.61 loss:0.132 grdn:0.509 lr:9.3e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 13:58:57 celerate.py:281 step:3K smpl:179K ep:1K epch:0.66 loss:0.126 grdn:0.492 lr:9.2e-05 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 14:00:43 celerate.py:281 step:3K 
smpl:192K ep:1K epch:0.70 loss:0.124 grdn:0.467 lr:9.1e-05 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 14:02:26 celerate.py:281 step:3K smpl:205K ep:1K epch:0.75 loss:0.119 grdn:0.438 lr:9.0e-05 updt_s:0 +.508 data_s:0.003 +INFO 2025-09-08 14:04:27 celerate.py:281 step:3K smpl:218K ep:1K epch:0.80 loss:0.118 grdn:0.426 lr:8.9e-05 updt_s:0 +.564 data_s:0.039 +INFO 2025-09-08 14:06:10 celerate.py:281 step:4K smpl:230K ep:1K epch:0.84 loss:0.116 grdn:0.422 lr:8.7e-05 updt_s:0 +.511 data_s:0.004 +INFO 2025-09-08 14:07:55 celerate.py:281 step:4K smpl:243K ep:2K epch:0.89 loss:0.113 grdn:0.395 lr:8.6e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:09:38 celerate.py:281 step:4K smpl:256K ep:2K epch:0.94 loss:0.111 grdn:0.401 lr:8.5e-05 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 14:11:21 celerate.py:281 step:4K smpl:269K ep:2K epch:0.98 loss:0.110 grdn:0.380 lr:8.3e-05 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 14:13:08 celerate.py:281 step:4K smpl:282K ep:2K epch:1.03 loss:0.109 grdn:0.381 lr:8.2e-05 updt_s:0 +.413 data_s:0.119 +INFO 2025-09-08 14:14:52 celerate.py:281 step:5K smpl:294K ep:2K epch:1.08 loss:0.107 grdn:0.387 lr:8.0e-05 updt_s:0 +.373 data_s:0.146 +INFO 2025-09-08 14:16:36 celerate.py:281 step:5K smpl:307K ep:2K epch:1.12 loss:0.107 grdn:0.366 lr:7.8e-05 updt_s:0 +.446 data_s:0.072 +INFO 2025-09-08 14:18:19 celerate.py:281 step:5K smpl:320K ep:2K epch:1.17 loss:0.105 grdn:0.347 lr:7.6e-05 updt_s:0 +.468 data_s:0.045 +INFO 2025-09-08 14:20:01 celerate.py:281 step:5K smpl:333K ep:2K epch:1.22 loss:0.103 grdn:0.350 lr:7.5e-05 updt_s:0 +.510 data_s:0.003 +INFO 2025-09-08 14:21:46 celerate.py:281 step:5K smpl:346K ep:2K epch:1.26 loss:0.101 grdn:0.336 lr:7.3e-05 updt_s:0 +.512 data_s:0.011 +INFO 2025-09-08 14:23:30 celerate.py:281 step:6K smpl:358K ep:2K epch:1.31 loss:0.102 grdn:0.345 lr:7.1e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 14:25:15 celerate.py:281 step:6K smpl:371K ep:2K epch:1.36 loss:0.100 grdn:0.333 lr:6.9e-05 updt_s:0 +.521 
data_s:0.003 +INFO 2025-09-08 14:26:59 celerate.py:281 step:6K smpl:384K ep:2K epch:1.40 loss:0.100 grdn:0.328 lr:6.7e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 14:28:43 celerate.py:281 step:6K smpl:397K ep:2K epch:1.45 loss:0.099 grdn:0.319 lr:6.5e-05 updt_s:0 +.512 data_s:0.003 +INFO 2025-09-08 14:30:26 celerate.py:281 step:6K smpl:410K ep:3K epch:1.50 loss:0.098 grdn:0.313 lr:6.3e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 14:32:11 celerate.py:281 step:7K smpl:422K ep:3K epch:1.54 loss:0.097 grdn:0.319 lr:6.1e-05 updt_s:0 +.519 data_s:0.004 +INFO 2025-09-08 14:33:55 celerate.py:281 step:7K smpl:435K ep:3K epch:1.59 loss:0.097 grdn:0.312 lr:5.9e-05 updt_s:0 +.506 data_s:0.010 +INFO 2025-09-08 14:35:39 celerate.py:281 step:7K smpl:448K ep:3K epch:1.64 loss:0.097 grdn:0.307 lr:5.7e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 14:37:23 celerate.py:281 step:7K smpl:461K ep:3K epch:1.69 loss:0.095 grdn:0.294 lr:5.5e-05 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 14:39:07 celerate.py:281 step:7K smpl:474K ep:3K epch:1.73 loss:0.095 grdn:0.299 lr:5.3e-05 updt_s:0 +.507 data_s:0.007 +INFO 2025-09-08 14:40:52 celerate.py:281 step:8K smpl:486K ep:3K epch:1.78 loss:0.094 grdn:0.283 lr:5.1e-05 updt_s:0 +.523 data_s:0.003 +INFO 2025-09-08 14:42:36 celerate.py:281 step:8K smpl:499K ep:3K epch:1.83 loss:0.093 grdn:0.284 lr:4.9e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:44:22 celerate.py:281 step:8K smpl:512K ep:3K epch:1.87 loss:0.092 grdn:0.284 lr:4.7e-05 updt_s:0 +.465 data_s:0.060 +INFO 2025-09-08 14:46:06 celerate.py:281 step:8K smpl:525K ep:3K epch:1.92 loss:0.093 grdn:0.292 lr:4.5e-05 updt_s:0 +.456 data_s:0.066 +INFO 2025-09-08 14:47:49 celerate.py:281 step:8K smpl:538K ep:3K epch:1.97 loss:0.093 grdn:0.290 lr:4.3e-05 updt_s:0 +.510 data_s:0.003 +INFO 2025-09-08 14:49:37 celerate.py:281 step:9K smpl:550K ep:3K epch:2.01 loss:0.092 grdn:0.283 lr:4.1e-05 updt_s:0 +.419 data_s:0.117 +INFO 2025-09-08 14:51:20 celerate.py:281 step:9K smpl:563K ep:3K 
epch:2.06 loss:0.092 grdn:0.275 lr:3.9e-05 updt_s:0 +.463 data_s:0.053 +INFO 2025-09-08 14:53:05 celerate.py:281 step:9K smpl:576K ep:4K epch:2.11 loss:0.090 grdn:0.272 lr:3.7e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:54:49 celerate.py:281 step:9K smpl:589K ep:4K epch:2.15 loss:0.090 grdn:0.268 lr:3.5e-05 updt_s:0 +.506 data_s:0.013 +INFO 2025-09-08 14:56:32 celerate.py:281 step:9K smpl:602K ep:4K epch:2.20 loss:0.090 grdn:0.271 lr:3.3e-05 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 14:58:17 celerate.py:281 step:10K smpl:614K ep:4K epch:2.25 loss:0.090 grdn:0.268 lr:3.1e-05 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 15:00:02 celerate.py:281 step:10K smpl:627K ep:4K epch:2.29 loss:0.089 grdn:0.261 lr:3.0e-05 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 15:01:48 celerate.py:281 step:10K smpl:640K ep:4K epch:2.34 loss:0.090 grdn:0.271 lr:2.8e-05 updt_s: +0.526 data_s:0.003 +INFO 2025-09-08 15:03:33 celerate.py:281 step:10K smpl:653K ep:4K epch:2.39 loss:0.089 grdn:0.262 lr:2.6e-05 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:05:18 celerate.py:281 step:10K smpl:666K ep:4K epch:2.43 loss:0.090 grdn:0.264 lr:2.4e-05 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 15:07:32 celerate.py:281 step:11K smpl:678K ep:4K epch:2.48 loss:0.089 grdn:0.255 lr:2.3e-05 updt_s: +0.663 data_s:0.004 +INFO 2025-09-08 15:09:21 celerate.py:281 step:11K smpl:691K ep:4K epch:2.53 loss:0.090 grdn:0.263 lr:2.1e-05 updt_s: +0.514 data_s:0.030 +INFO 2025-09-08 15:11:06 celerate.py:281 step:11K smpl:704K ep:4K epch:2.57 loss:0.088 grdn:0.254 lr:1.9e-05 updt_s: +0.517 data_s:0.006 +INFO 2025-09-08 15:12:51 celerate.py:281 step:11K smpl:717K ep:4K epch:2.62 loss:0.088 grdn:0.252 lr:1.8e-05 updt_s: +0.517 data_s:0.005 +INFO 2025-09-08 15:14:38 celerate.py:281 step:11K smpl:730K ep:5K epch:2.67 loss:0.088 grdn:0.251 lr:1.6e-05 updt_s: +0.532 data_s:0.003 +INFO 2025-09-08 15:16:23 celerate.py:281 step:12K smpl:742K ep:5K epch:2.71 loss:0.088 grdn:0.253 lr:1.5e-05 updt_s: +0.520 data_s:0.003 
+INFO 2025-09-08 15:18:08 celerate.py:281 step:12K smpl:755K ep:5K epch:2.76 loss:0.087 grdn:0.244 lr:1.4e-05 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:19:54 celerate.py:281 step:12K smpl:768K ep:5K epch:2.81 loss:0.088 grdn:0.247 lr:1.2e-05 updt_s: +0.524 data_s:0.003 +INFO 2025-09-08 15:21:39 celerate.py:281 step:12K smpl:781K ep:5K epch:2.86 loss:0.087 grdn:0.242 lr:1.1e-05 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 15:23:32 celerate.py:281 step:12K smpl:794K ep:5K epch:2.90 loss:0.088 grdn:0.243 lr:1.0e-05 updt_s: +0.560 data_s:0.003 +INFO 2025-09-08 15:25:48 celerate.py:281 step:13K smpl:806K ep:5K epch:2.95 loss:0.087 grdn:0.240 lr:9.0e-06 updt_s: +0.674 data_s:0.005 +INFO 2025-09-08 15:28:02 celerate.py:281 step:13K smpl:819K ep:5K epch:3.00 loss:0.088 grdn:0.245 lr:8.0e-06 updt_s: +0.662 data_s:0.004 +INFO 2025-09-08 15:31:06 celerate.py:281 step:13K smpl:832K ep:5K epch:3.04 loss:0.086 grdn:0.236 lr:7.1e-06 updt_s: +0.688 data_s:0.231 +INFO 2025-09-08 15:32:52 celerate.py:281 step:13K smpl:845K ep:5K epch:3.09 loss:0.087 grdn:0.231 lr:6.3e-06 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:35:46 celerate.py:281 step:13K smpl:858K ep:5K epch:3.14 loss:0.088 grdn:0.235 lr:5.6e-06 updt_s: +0.637 data_s:0.232 +INFO 2025-09-08 15:37:34 celerate.py:281 step:14K smpl:870K ep:5K epch:3.18 loss:0.087 grdn:0.238 lr:4.9e-06 updt_s: +0.514 data_s:0.025 +INFO 2025-09-08 15:39:18 celerate.py:281 step:14K smpl:883K ep:5K epch:3.23 loss:0.087 grdn:0.226 lr:4.3e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:41:02 celerate.py:281 step:14K smpl:896K ep:6K epch:3.28 loss:0.087 grdn:0.230 lr:3.8e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:42:45 celerate.py:281 step:14K smpl:909K ep:6K epch:3.32 loss:0.086 grdn:0.229 lr:3.4e-06 updt_s: +0.507 data_s:0.008 +INFO 2025-09-08 15:44:29 celerate.py:281 step:14K smpl:922K ep:6K epch:3.37 loss:0.087 grdn:0.229 lr:3.0e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:46:12 celerate.py:281 step:15K smpl:934K ep:6K 
epch:3.42 loss:0.087 grdn:0.228 lr:2.8e-06 updt_s: +0.502 data_s:0.011 +INFO 2025-09-08 15:47:56 celerate.py:281 step:15K smpl:947K ep:6K epch:3.46 loss:0.086 grdn:0.232 lr:2.6e-06 updt_s: +0.515 data_s:0.004 +INFO 2025-09-08 15:49:39 celerate.py:281 step:15K smpl:960K ep:6K epch:3.51 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: +0.365 data_s:0.147 +INFO 2025-09-08 15:51:22 celerate.py:281 step:15K smpl:973K ep:6K epch:3.56 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: +0.333 data_s:0.179 +INFO 2025-09-08 15:53:07 celerate.py:281 step:15K smpl:986K ep:6K epch:3.60 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s: +0.357 data_s:0.164 +INFO 2025-09-08 15:54:50 celerate.py:281 step:16K smpl:998K ep:6K epch:3.65 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s: +0.365 data_s:0.151 +INFO 2025-09-08 15:56:35 celerate.py:281 step:16K smpl:1M ep:6K epch:3.70 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. +450 data_s:0.071 +INFO 2025-09-08 15:58:19 celerate.py:281 step:16K smpl:1M ep:6K epch:3.74 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0. +495 data_s:0.023 +INFO 2025-09-08 16:00:04 celerate.py:281 step:16K smpl:1M ep:6K epch:3.79 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0. +393 data_s:0.131 +INFO 2025-09-08 16:01:48 celerate.py:281 step:16K smpl:1M ep:6K epch:3.84 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +515 data_s:0.003 +INFO 2025-09-08 16:03:32 celerate.py:281 step:17K smpl:1M ep:7K epch:3.88 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +518 data_s:0.003 +INFO 2025-09-08 16:05:17 celerate.py:281 step:17K smpl:1M ep:7K epch:3.93 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +397 data_s:0.125 +INFO 2025-09-08 16:12:41 celerate.py:281 step:17K smpl:1M ep:7K epch:3.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:1. +496 data_s:0.719 +INFO 2025-09-08 16:14:28 celerate.py:281 step:17K smpl:1M ep:7K epch:4.03 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. +413 data_s:0.124 +INFO 2025-09-08 16:16:13 celerate.py:281 step:17K smpl:1M ep:7K epch:4.07 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. 
+469 data_s:0.050 +INFO 2025-09-08 16:17:57 celerate.py:281 step:18K smpl:1M ep:7K epch:4.12 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. +375 data_s:0.145 +INFO 2025-09-08 16:19:42 celerate.py:281 step:18K smpl:1M ep:7K epch:4.17 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +333 data_s:0.189 +INFO 2025-09-08 16:21:26 celerate.py:281 step:18K smpl:1M ep:7K epch:4.21 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +334 data_s:0.184 +INFO 2025-09-08 16:23:09 celerate.py:281 step:18K smpl:1M ep:7K epch:4.26 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0. +331 data_s:0.185 +INFO 2025-09-08 16:24:53 celerate.py:281 step:18K smpl:1M ep:7K epch:4.31 loss:0.088 grdn:0.236 lr:2.5e-06 updt_s:0. +333 data_s:0.182 +INFO 2025-09-08 16:26:38 celerate.py:281 step:19K smpl:1M ep:7K epch:4.35 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +337 data_s:0.188 +INFO 2025-09-08 16:28:22 celerate.py:281 step:19K smpl:1M ep:7K epch:4.40 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0. +420 data_s:0.099 +INFO 2025-09-08 16:30:06 celerate.py:281 step:19K smpl:1M ep:8K epch:4.45 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +444 data_s:0.075 +INFO 2025-09-08 16:31:49 celerate.py:281 step:19K smpl:1M ep:8K epch:4.49 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. +475 data_s:0.036 +INFO 2025-09-08 16:33:33 celerate.py:281 step:19K smpl:1M ep:8K epch:4.54 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. +379 data_s:0.139 +INFO 2025-09-08 16:35:17 celerate.py:281 step:20K smpl:1M ep:8K epch:4.59 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0. +348 data_s:0.171 +INFO 2025-09-08 16:37:01 celerate.py:281 step:20K smpl:1M ep:8K epch:4.63 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +332 data_s:0.185 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. 
+ warnings.warn( # warn only once +INFO 2025-09-08 16:38:46 celerate.py:281 step:20K smpl:1M ep:8K epch:4.68 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. +486 data_s:0.037 +INFO 2025-09-08 16:38:46 celerate.py:295 Checkpoint policy after step 20000 +INFO 2025-09-08 16:40:30 celerate.py:281 step:20K smpl:1M ep:8K epch:4.73 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +509 data_s:0.003 +INFO 2025-09-08 16:42:16 celerate.py:281 step:20K smpl:1M ep:8K epch:4.77 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0. +527 data_s:0.003 +INFO 2025-09-08 16:44:01 celerate.py:281 step:21K smpl:1M ep:8K epch:4.82 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0. +519 data_s:0.003 +INFO 2025-09-08 16:45:45 celerate.py:281 step:21K smpl:1M ep:8K epch:4.87 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. +504 data_s:0.013 +INFO 2025-09-08 16:47:29 celerate.py:281 step:21K smpl:1M ep:8K epch:4.91 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. +509 data_s:0.011 +INFO 2025-09-08 16:49:19 celerate.py:281 step:21K smpl:1M ep:8K epch:4.96 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0. +544 data_s:0.003 +INFO 2025-09-08 16:51:04 celerate.py:281 step:21K smpl:1M ep:8K epch:5.01 loss:0.086 grdn:0.225 lr:2.5e-06 updt_s:0. +488 data_s:0.039 +INFO 2025-09-08 16:52:51 celerate.py:281 step:22K smpl:1M ep:9K epch:5.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +430 data_s:0.099 +INFO 2025-09-08 16:54:36 celerate.py:281 step:22K smpl:1M ep:9K epch:5.10 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. +521 data_s:0.003 +INFO 2025-09-08 16:56:23 celerate.py:281 step:22K smpl:1M ep:9K epch:5.15 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +521 data_s:0.014 +INFO 2025-09-08 16:58:09 celerate.py:281 step:22K smpl:1M ep:9K epch:5.20 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. +525 data_s:0.003 +INFO 2025-09-08 17:00:04 celerate.py:281 step:22K smpl:1M ep:9K epch:5.24 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +568 data_s:0.003 +INFO 2025-09-08 17:02:00 celerate.py:281 step:23K smpl:1M ep:9K epch:5.29 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0. 
+575 data_s:0.003 +INFO 2025-09-08 17:03:49 celerate.py:281 step:23K smpl:1M ep:9K epch:5.34 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. +513 data_s:0.030 +INFO 2025-09-08 17:05:39 celerate.py:281 step:23K smpl:1M ep:9K epch:5.38 loss:0.085 grdn:0.227 lr:2.5e-06 updt_s:0. +523 data_s:0.027 +INFO 2025-09-08 17:07:26 celerate.py:281 step:23K smpl:1M ep:9K epch:5.43 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +529 data_s:0.003 +INFO 2025-09-08 17:09:12 celerate.py:281 step:23K smpl:1M ep:9K epch:5.48 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +526 data_s:0.003 +INFO 2025-09-08 17:10:55 celerate.py:281 step:24K smpl:2M ep:9K epch:5.52 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:0. +443 data_s:0.072 +INFO 2025-09-08 17:12:40 celerate.py:281 step:24K smpl:2M ep:9K epch:5.57 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0. +518 data_s:0.004 +INFO 2025-09-08 17:14:25 celerate.py:281 step:24K smpl:2M ep:10K epch:5.62 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 17:16:11 celerate.py:281 step:24K smpl:2M ep:10K epch:5.66 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0 +.523 data_s:0.003 +INFO 2025-09-08 17:17:55 celerate.py:281 step:24K smpl:2M ep:10K epch:5.71 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.515 data_s:0.005 +INFO 2025-09-08 17:19:39 celerate.py:281 step:25K smpl:2M ep:10K epch:5.76 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 +.415 data_s:0.106 +INFO 2025-09-08 17:21:24 celerate.py:281 step:25K smpl:2M ep:10K epch:5.80 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.507 data_s:0.016 +INFO 2025-09-08 17:23:08 celerate.py:281 step:25K smpl:2M ep:10K epch:5.85 loss:0.085 grdn:0.229 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 17:24:54 celerate.py:281 step:25K smpl:2M ep:10K epch:5.90 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0 +.518 data_s:0.008 +INFO 2025-09-08 17:26:41 celerate.py:281 step:25K smpl:2M ep:10K epch:5.94 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.529 data_s:0.003 +INFO 2025-09-08 17:28:24 celerate.py:281 step:26K smpl:2M ep:10K 
epch:5.99 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 17:30:11 celerate.py:281 step:26K smpl:2M ep:10K epch:6.04 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.370 data_s:0.164 +INFO 2025-09-08 17:31:55 celerate.py:281 step:26K smpl:2M ep:10K epch:6.08 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.385 data_s:0.132 +INFO 2025-09-08 17:33:39 celerate.py:281 step:26K smpl:2M ep:10K epch:6.13 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.450 data_s:0.069 +INFO 2025-09-08 17:35:24 celerate.py:281 step:26K smpl:2M ep:10K epch:6.18 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.468 data_s:0.052 +INFO 2025-09-08 17:37:07 celerate.py:281 step:27K smpl:2M ep:11K epch:6.23 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.004 +INFO 2025-09-08 17:38:52 celerate.py:281 step:27K smpl:2M ep:11K epch:6.27 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.519 data_s:0.003 +INFO 2025-09-08 17:40:40 celerate.py:281 step:27K smpl:2M ep:11K epch:6.32 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.534 data_s:0.003 +INFO 2025-09-08 17:42:57 celerate.py:281 step:27K smpl:2M ep:11K epch:6.37 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.678 data_s:0.007 +INFO 2025-09-08 17:46:13 celerate.py:281 step:27K smpl:2M ep:11K epch:6.41 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.968 data_s:0.009 +INFO 2025-09-08 17:49:20 celerate.py:281 step:28K smpl:2M ep:11K epch:6.46 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.895 data_s:0.037 +INFO 2025-09-08 17:51:22 celerate.py:281 step:28K smpl:2M ep:11K epch:6.51 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.604 data_s:0.003 +INFO 2025-09-08 17:53:07 celerate.py:281 step:28K smpl:2M ep:11K epch:6.55 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 17:54:51 celerate.py:281 step:28K smpl:2M ep:11K epch:6.60 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 17:56:36 celerate.py:281 step:28K smpl:2M ep:11K epch:6.65 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.519 data_s:0.003 +INFO 
2025-09-08 17:58:21 celerate.py:281 step:29K smpl:2M ep:11K epch:6.69 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 18:00:06 celerate.py:281 step:29K smpl:2M ep:11K epch:6.74 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.513 data_s:0.011 +INFO 2025-09-08 18:01:50 celerate.py:281 step:29K smpl:2M ep:11K epch:6.79 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.476 data_s:0.041 +INFO 2025-09-08 18:03:34 celerate.py:281 step:29K smpl:2M ep:12K epch:6.83 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.506 data_s:0.012 +INFO 2025-09-08 18:05:21 celerate.py:281 step:29K smpl:2M ep:12K epch:6.88 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 +.455 data_s:0.075 +INFO 2025-09-08 18:07:04 celerate.py:281 step:30K smpl:2M ep:12K epch:6.93 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 18:08:47 celerate.py:281 step:30K smpl:2M ep:12K epch:6.97 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.509 data_s:0.003 +INFO 2025-09-08 18:10:33 celerate.py:281 step:30K smpl:2M ep:12K epch:7.02 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.422 data_s:0.105 +INFO 2025-09-08 18:12:19 celerate.py:281 step:30K smpl:2M ep:12K epch:7.07 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.347 data_s:0.182 +INFO 2025-09-08 18:14:05 celerate.py:281 step:30K smpl:2M ep:12K epch:7.11 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.473 data_s:0.053 +INFO 2025-09-08 18:15:52 celerate.py:281 step:31K smpl:2M ep:12K epch:7.16 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 +.531 data_s:0.005 +INFO 2025-09-08 18:17:37 celerate.py:281 step:31K smpl:2M ep:12K epch:7.21 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.520 data_s:0.003 +INFO 2025-09-08 18:19:22 celerate.py:281 step:31K smpl:2M ep:12K epch:7.26 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.500 data_s:0.020 +INFO 2025-09-08 18:21:06 celerate.py:281 step:31K smpl:2M ep:12K epch:7.30 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 +.511 data_s:0.009 +INFO 2025-09-08 18:22:50 celerate.py:281 step:31K smpl:2M ep:12K epch:7.35 loss:0.087 
grdn:0.227 lr:2.5e-06 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 18:24:33 celerate.py:281 step:32K smpl:2M ep:13K epch:7.40 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.507 data_s:0.007 +INFO 2025-09-08 18:26:16 celerate.py:281 step:32K smpl:2M ep:13K epch:7.44 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.463 data_s:0.047 +INFO 2025-09-08 18:27:59 celerate.py:281 step:32K smpl:2M ep:13K epch:7.49 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:0 +.509 data_s:0.007 +INFO 2025-09-08 18:29:43 celerate.py:281 step:32K smpl:2M ep:13K epch:7.54 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 18:31:26 celerate.py:281 step:32K smpl:2M ep:13K epch:7.58 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 18:33:11 celerate.py:281 step:33K smpl:2M ep:13K epch:7.63 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 18:34:57 celerate.py:281 step:33K smpl:2M ep:13K epch:7.68 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.512 data_s:0.016 +INFO 2025-09-08 18:36:41 celerate.py:281 step:33K smpl:2M ep:13K epch:7.72 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 18:38:25 celerate.py:281 step:33K smpl:2M ep:13K epch:7.77 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 18:40:07 celerate.py:281 step:33K smpl:2M ep:13K epch:7.82 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.506 data_s:0.003 +INFO 2025-09-08 18:41:51 celerate.py:281 step:34K smpl:2M ep:13K epch:7.86 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.509 data_s:0.006 +INFO 2025-09-08 18:43:36 celerate.py:281 step:34K smpl:2M ep:13K epch:7.91 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 18:45:19 celerate.py:281 step:34K smpl:2M ep:13K epch:7.96 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 18:47:05 celerate.py:281 step:34K smpl:2M ep:14K epch:8.00 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.495 data_s:0.035 +INFO 2025-09-08 18:48:51 
celerate.py:281 step:34K smpl:2M ep:14K epch:8.05 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 +.413 data_s:0.112 +INFO 2025-09-08 18:50:34 celerate.py:281 step:35K smpl:2M ep:14K epch:8.10 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 18:52:19 celerate.py:281 step:35K smpl:2M ep:14K epch:8.14 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.520 data_s:0.003 +INFO 2025-09-08 18:54:03 celerate.py:281 step:35K smpl:2M ep:14K epch:8.19 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 18:55:48 celerate.py:281 step:35K smpl:2M ep:14K epch:8.24 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.420 data_s:0.101 +INFO 2025-09-08 18:57:33 celerate.py:281 step:35K smpl:2M ep:14K epch:8.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.506 data_s:0.022 +INFO 2025-09-08 18:59:19 celerate.py:281 step:36K smpl:2M ep:14K epch:8.33 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 19:01:03 celerate.py:281 step:36K smpl:2M ep:14K epch:8.38 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:02:48 celerate.py:281 step:36K smpl:2M ep:14K epch:8.43 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:04:32 celerate.py:281 step:36K smpl:2M ep:14K epch:8.47 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.505 data_s:0.013 +INFO 2025-09-08 19:06:15 celerate.py:281 step:36K smpl:2M ep:14K epch:8.52 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.384 data_s:0.130 +INFO 2025-09-08 19:07:58 celerate.py:281 step:37K smpl:2M ep:15K epch:8.57 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.430 data_s:0.084 +INFO 2025-09-08 19:09:41 celerate.py:281 step:37K smpl:2M ep:15K epch:8.61 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.351 data_s:0.162 +INFO 2025-09-08 19:11:25 celerate.py:281 step:37K smpl:2M ep:15K epch:8.66 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.337 data_s:0.181 +INFO 2025-09-08 19:13:08 celerate.py:281 step:37K smpl:2M ep:15K epch:8.71 loss:0.086 grdn:0.232 
lr:2.5e-06 updt_s:0 +.336 data_s:0.177 +INFO 2025-09-08 19:14:52 celerate.py:281 step:37K smpl:2M ep:15K epch:8.75 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.344 data_s:0.174 +INFO 2025-09-08 19:16:35 celerate.py:281 step:38K smpl:2M ep:15K epch:8.80 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.332 data_s:0.182 +INFO 2025-09-08 19:18:18 celerate.py:281 step:38K smpl:2M ep:15K epch:8.85 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.332 data_s:0.182 +INFO 2025-09-08 19:20:01 celerate.py:281 step:38K smpl:2M ep:15K epch:8.89 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.337 data_s:0.177 +INFO 2025-09-08 19:21:45 celerate.py:281 step:38K smpl:2M ep:15K epch:8.94 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.495 data_s:0.022 +INFO 2025-09-08 19:23:29 celerate.py:281 step:38K smpl:2M ep:15K epch:8.99 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:25:15 celerate.py:281 step:39K smpl:2M ep:15K epch:9.03 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.436 data_s:0.097 +INFO 2025-09-08 19:26:59 celerate.py:281 step:39K smpl:2M ep:15K epch:9.08 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.378 data_s:0.138 +INFO 2025-09-08 19:28:43 celerate.py:281 step:39K smpl:2M ep:15K epch:9.13 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:0 +.497 data_s:0.023 +INFO 2025-09-08 19:30:27 celerate.py:281 step:39K smpl:3M ep:16K epch:9.17 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:32:14 celerate.py:281 step:39K smpl:3M ep:16K epch:9.22 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.526 data_s:0.003 +INFO 2025-09-08 19:33:57 celerate.py:281 step:40K smpl:3M ep:16K epch:9.27 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.506 data_s:0.011 +INFO 2025-09-08 19:35:42 celerate.py:281 step:40K smpl:3M ep:16K epch:9.31 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 +.455 data_s:0.066 +INFO 2025-09-08 19:37:26 celerate.py:281 step:40K smpl:3M ep:16K epch:9.36 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 +.493 data_s:0.026 +INFO 2025-09-08 19:37:26 celerate.py:295 
Checkpoint policy after step 40000 +INFO 2025-09-08 19:39:10 celerate.py:281 step:40K smpl:3M ep:16K epch:9.41 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.397 data_s:0.114 +INFO 2025-09-08 19:40:53 celerate.py:281 step:40K smpl:3M ep:16K epch:9.45 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.344 data_s:0.168 +INFO 2025-09-08 19:42:37 celerate.py:281 step:41K smpl:3M ep:16K epch:9.50 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.480 data_s:0.036 +INFO 2025-09-08 19:44:21 celerate.py:281 step:41K smpl:3M ep:16K epch:9.55 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 19:46:05 celerate.py:281 step:41K smpl:3M ep:16K epch:9.60 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 19:47:49 celerate.py:281 step:41K smpl:3M ep:16K epch:9.64 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:49:33 celerate.py:281 step:41K smpl:3M ep:16K epch:9.69 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:51:17 celerate.py:281 step:42K smpl:3M ep:16K epch:9.74 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:53:00 celerate.py:281 step:42K smpl:3M ep:17K epch:9.78 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:54:44 celerate.py:281 step:42K smpl:3M ep:17K epch:9.83 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:56:28 celerate.py:281 step:42K smpl:3M ep:17K epch:9.88 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.512 data_s:0.003 +INFO 2025-09-08 19:58:11 celerate.py:281 step:42K smpl:3M ep:17K epch:9.92 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 19:59:55 celerate.py:281 step:43K smpl:3M ep:17K epch:9.97 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 20:01:42 celerate.py:281 step:43K smpl:3M ep:17K epch:10.02 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: +0.476 data_s:0.057 +INFO 2025-09-08 20:03:25 celerate.py:281 
step:43K smpl:3M ep:17K epch:10.06 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.471 data_s:0.043 +INFO 2025-09-08 20:05:09 celerate.py:281 step:43K smpl:3M ep:17K epch:10.11 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.515 data_s:0.004 +INFO 2025-09-08 20:06:53 celerate.py:281 step:43K smpl:3M ep:17K epch:10.16 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.013 +INFO 2025-09-08 20:08:36 celerate.py:281 step:44K smpl:3M ep:17K epch:10.20 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 20:10:20 celerate.py:281 step:44K smpl:3M ep:17K epch:10.25 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:12:04 celerate.py:281 step:44K smpl:3M ep:17K epch:10.30 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 20:13:47 celerate.py:281 step:44K smpl:3M ep:18K epch:10.34 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.503 data_s:0.011 +INFO 2025-09-08 20:15:31 celerate.py:281 step:44K smpl:3M ep:18K epch:10.39 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.416 data_s:0.102 +INFO 2025-09-08 20:17:15 celerate.py:281 step:45K smpl:3M ep:18K epch:10.44 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.502 data_s:0.017 +INFO 2025-09-08 20:18:58 celerate.py:281 step:45K smpl:3M ep:18K epch:10.48 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 20:20:41 celerate.py:281 step:45K smpl:3M ep:18K epch:10.53 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.496 data_s:0.017 +INFO 2025-09-08 20:22:24 celerate.py:281 step:45K smpl:3M ep:18K epch:10.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.493 data_s:0.022 +INFO 2025-09-08 20:24:08 celerate.py:281 step:45K smpl:3M ep:18K epch:10.63 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.485 data_s:0.031 +INFO 2025-09-08 20:25:52 celerate.py:281 step:46K smpl:3M ep:18K epch:10.67 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:27:36 celerate.py:281 step:46K smpl:3M ep:18K epch:10.72 loss:0.085 grdn:0.228 lr:2.5e-06 
updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 20:29:19 celerate.py:281 step:46K smpl:3M ep:18K epch:10.77 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 20:31:04 celerate.py:281 step:46K smpl:3M ep:18K epch:10.81 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:32:47 celerate.py:281 step:46K smpl:3M ep:18K epch:10.86 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 20:34:32 celerate.py:281 step:47K smpl:3M ep:18K epch:10.91 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:36:15 celerate.py:281 step:47K smpl:3M ep:19K epch:10.95 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 20:38:01 celerate.py:281 step:47K smpl:3M ep:19K epch:11.00 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.416 data_s:0.114 +INFO 2025-09-08 20:39:45 celerate.py:281 step:47K smpl:3M ep:19K epch:11.05 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.429 data_s:0.086 +INFO 2025-09-08 20:41:28 celerate.py:281 step:47K smpl:3M ep:19K epch:11.09 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: +0.409 data_s:0.106 +INFO 2025-09-08 20:43:12 celerate.py:281 step:48K smpl:3M ep:19K epch:11.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.470 data_s:0.050 +INFO 2025-09-08 20:44:56 celerate.py:281 step:48K smpl:3M ep:19K epch:11.19 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:46:41 celerate.py:281 step:48K smpl:3M ep:19K epch:11.23 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 20:48:25 celerate.py:281 step:48K smpl:3M ep:19K epch:11.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:50:08 celerate.py:281 step:48K smpl:3M ep:19K epch:11.33 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.505 data_s:0.009 +INFO 2025-09-08 20:51:52 celerate.py:281 step:49K smpl:3M ep:19K epch:11.37 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.449 data_s:0.067 +INFO 2025-09-08 20:53:35 
celerate.py:281 step:49K smpl:3M ep:19K epch:11.42 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.420 data_s:0.094 +INFO 2025-09-08 20:55:19 celerate.py:281 step:49K smpl:3M ep:19K epch:11.47 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 20:57:01 celerate.py:281 step:49K smpl:3M ep:19K epch:11.51 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.505 data_s:0.003 +INFO 2025-09-08 20:58:44 celerate.py:281 step:49K smpl:3M ep:20K epch:11.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 21:00:28 celerate.py:281 step:50K smpl:3M ep:20K epch:11.61 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 21:02:11 celerate.py:281 step:50K smpl:3M ep:20K epch:11.65 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.402 data_s:0.110 +INFO 2025-09-08 21:03:54 celerate.py:281 step:50K smpl:3M ep:20K epch:11.70 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.332 data_s:0.184 +INFO 2025-09-08 21:05:37 celerate.py:281 step:50K smpl:3M ep:20K epch:11.75 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: +0.332 data_s:0.182 +INFO 2025-09-08 21:07:21 celerate.py:281 step:50K smpl:3M ep:20K epch:11.80 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.466 data_s:0.049 +INFO 2025-09-08 21:09:05 celerate.py:281 step:51K smpl:3M ep:20K epch:11.84 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:10:49 celerate.py:281 step:51K smpl:3M ep:20K epch:11.89 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.512 data_s:0.004 +INFO 2025-09-08 21:12:32 celerate.py:281 step:51K smpl:3M ep:20K epch:11.94 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.484 data_s:0.032 +INFO 2025-09-08 21:14:17 celerate.py:281 step:51K smpl:3M ep:20K epch:11.98 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.517 data_s:0.004 +INFO 2025-09-08 21:16:03 celerate.py:281 step:51K smpl:3M ep:20K epch:12.03 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.424 data_s:0.105 +INFO 2025-09-08 21:17:46 celerate.py:281 step:52K smpl:3M ep:20K epch:12.08 loss:0.086 
grdn:0.236 lr:2.5e-06 updt_s: +0.442 data_s:0.073 +INFO 2025-09-08 21:19:30 celerate.py:281 step:52K smpl:3M ep:21K epch:12.12 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.511 data_s:0.007 +INFO 2025-09-08 21:21:15 celerate.py:281 step:52K smpl:3M ep:21K epch:12.17 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 21:22:59 celerate.py:281 step:52K smpl:3M ep:21K epch:12.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 21:24:43 celerate.py:281 step:52K smpl:3M ep:21K epch:12.26 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 21:26:27 celerate.py:281 step:53K smpl:3M ep:21K epch:12.31 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 21:28:11 celerate.py:281 step:53K smpl:3M ep:21K epch:12.36 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:29:55 celerate.py:281 step:53K smpl:3M ep:21K epch:12.40 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 21:31:39 celerate.py:281 step:53K smpl:3M ep:21K epch:12.45 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:33:23 celerate.py:281 step:53K smpl:3M ep:21K epch:12.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 21:35:08 celerate.py:281 step:54K smpl:3M ep:21K epch:12.54 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 21:36:51 celerate.py:281 step:54K smpl:3M ep:21K epch:12.59 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 21:38:36 celerate.py:281 step:54K smpl:3M ep:21K epch:12.64 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:40:18 celerate.py:281 step:54K smpl:3M ep:21K epch:12.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.473 data_s:0.038 +INFO 2025-09-08 21:42:02 celerate.py:281 step:54K smpl:3M ep:22K epch:12.73 loss:0.085 grdn:0.232 lr:2.5e-06 updt_s: +0.408 data_s:0.109 +INFO 2025-09-08 
21:43:45 celerate.py:281 step:55K smpl:3M ep:22K epch:12.78 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.377 data_s:0.136 +INFO 2025-09-08 21:45:29 celerate.py:281 step:55K smpl:4M ep:22K epch:12.83 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s: +0.497 data_s:0.022 +INFO 2025-09-08 21:47:12 celerate.py:281 step:55K smpl:4M ep:22K epch:12.87 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 21:48:56 celerate.py:281 step:55K smpl:4M ep:22K epch:12.92 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.429 data_s:0.086 +INFO 2025-09-08 21:50:39 celerate.py:281 step:55K smpl:4M ep:22K epch:12.97 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.454 data_s:0.059 +INFO 2025-09-08 21:52:25 celerate.py:281 step:56K smpl:4M ep:22K epch:13.01 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.459 data_s:0.072 +INFO 2025-09-08 21:54:08 celerate.py:281 step:56K smpl:4M ep:22K epch:13.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.382 data_s:0.132 +INFO 2025-09-08 21:55:51 celerate.py:281 step:56K smpl:4M ep:22K epch:13.11 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.500 data_s:0.016 +INFO 2025-09-08 21:57:36 celerate.py:281 step:56K smpl:4M ep:22K epch:13.15 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:59:20 celerate.py:281 step:56K smpl:4M ep:22K epch:13.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 22:01:03 celerate.py:281 step:57K smpl:4M ep:22K epch:13.25 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 22:02:48 celerate.py:281 step:57K smpl:4M ep:23K epch:13.29 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 22:04:32 celerate.py:281 step:57K smpl:4M ep:23K epch:13.34 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:06:14 celerate.py:281 step:57K smpl:4M ep:23K epch:13.39 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.505 data_s:0.005 +bINFO 2025-09-08 22:07:59 celerate.py:281 step:57K smpl:4M ep:23K epch:13.43 
loss:0.086 grdn:0.239 lr:2.5e-06 updt_s +:0.496 data_s:0.026 +INFO 2025-09-08 22:09:43 celerate.py:281 step:58K smpl:4M ep:23K epch:13.48 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.438 data_s:0.080 +INFO 2025-09-08 22:11:27 celerate.py:281 step:58K smpl:4M ep:23K epch:13.53 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.444 data_s:0.073 +INFO 2025-09-08 22:13:11 celerate.py:281 step:58K smpl:4M ep:23K epch:13.57 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:14:55 celerate.py:281 step:58K smpl:4M ep:23K epch:13.62 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 22:16:39 celerate.py:281 step:58K smpl:4M ep:23K epch:13.67 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:18:22 celerate.py:281 step:59K smpl:4M ep:23K epch:13.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:20:05 celerate.py:281 step:59K smpl:4M ep:23K epch:13.76 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-08 22:21:48 celerate.py:281 step:59K smpl:4M ep:23K epch:13.81 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s: +0.505 data_s:0.008 +INFO 2025-09-08 22:23:31 celerate.py:281 step:59K smpl:4M ep:23K epch:13.85 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.505 data_s:0.008 +INFO 2025-09-08 22:25:15 celerate.py:281 step:59K smpl:4M ep:24K epch:13.90 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:26:58 celerate.py:281 step:60K smpl:4M ep:24K epch:13.95 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.493 data_s:0.022 +INFO 2025-09-08 22:28:42 celerate.py:281 step:60K smpl:4M ep:24K epch:14.00 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:30:29 celerate.py:281 step:60K smpl:4M ep:24K epch:14.04 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.435 data_s:0.101 +INFO 2025-09-08 22:30:29 celerate.py:295 Checkpoint policy after step 60000 +INFO 2025-09-08 22:32:14 celerate.py:281 step:60K smpl:4M ep:24K 
epch:14.09 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-08 22:33:58 celerate.py:281 step:60K smpl:4M ep:24K epch:14.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 22:35:42 celerate.py:281 step:61K smpl:4M ep:24K epch:14.18 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:37:25 celerate.py:281 step:61K smpl:4M ep:24K epch:14.23 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:39:09 celerate.py:281 step:61K smpl:4M ep:24K epch:14.28 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 22:40:52 celerate.py:281 step:61K smpl:4M ep:24K epch:14.32 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 22:42:35 celerate.py:281 step:61K smpl:4M ep:24K epch:14.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:44:18 celerate.py:281 step:62K smpl:4M ep:24K epch:14.42 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 22:46:02 celerate.py:281 step:62K smpl:4M ep:24K epch:14.46 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:47:47 celerate.py:281 step:62K smpl:4M ep:25K epch:14.51 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 22:49:30 celerate.py:281 step:62K smpl:4M ep:25K epch:14.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 22:51:14 celerate.py:281 step:62K smpl:4M ep:25K epch:14.60 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 22:52:58 celerate.py:281 step:63K smpl:4M ep:25K epch:14.65 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.484 data_s:0.033 +INFO 2025-09-08 22:54:41 celerate.py:281 step:63K smpl:4M ep:25K epch:14.70 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.501 data_s:0.016 +INFO 2025-09-08 22:56:25 celerate.py:281 step:63K smpl:4M ep:25K epch:14.74 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.436 
data_s:0.081 +INFO 2025-09-08 22:58:08 celerate.py:281 step:63K smpl:4M ep:25K epch:14.79 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.436 data_s:0.080 +INFO 2025-09-08 22:59:51 celerate.py:281 step:63K smpl:4M ep:25K epch:14.84 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.344 data_s:0.168 +INFO 2025-09-08 23:01:34 celerate.py:281 step:64K smpl:4M ep:25K epch:14.88 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.479 data_s:0.035 +INFO 2025-09-08 23:03:18 celerate.py:281 step:64K smpl:4M ep:25K epch:14.93 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 23:05:02 celerate.py:281 step:64K smpl:4M ep:25K epch:14.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 23:06:47 celerate.py:281 step:64K smpl:4M ep:25K epch:15.02 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.405 data_s:0.119 +INFO 2025-09-08 23:08:30 celerate.py:281 step:64K smpl:4M ep:26K epch:15.07 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: +0.393 data_s:0.121 +INFO 2025-09-08 23:10:13 celerate.py:281 step:65K smpl:4M ep:26K epch:15.12 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.369 data_s:0.142 +INFO 2025-09-08 23:11:56 celerate.py:281 step:65K smpl:4M ep:26K epch:15.17 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: +0.360 data_s:0.156 +INFO 2025-09-08 23:13:40 celerate.py:281 step:65K smpl:4M ep:26K epch:15.21 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.333 data_s:0.182 +INFO 2025-09-08 23:15:23 celerate.py:281 step:65K smpl:4M ep:26K epch:15.26 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.376 data_s:0.136 +INFO 2025-09-08 23:17:05 celerate.py:281 step:65K smpl:4M ep:26K epch:15.31 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.439 data_s:0.069 +INFO 2025-09-08 23:18:49 celerate.py:281 step:66K smpl:4M ep:26K epch:15.35 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.512 data_s:0.006 +INFO 2025-09-08 23:20:32 celerate.py:281 step:66K smpl:4M ep:26K epch:15.40 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 23:22:15 celerate.py:281 step:66K 
smpl:4M ep:26K epch:15.45 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.508 data_s:0.004 +INFO 2025-09-08 23:23:59 celerate.py:281 step:66K smpl:4M ep:26K epch:15.49 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.474 data_s:0.046 +INFO 2025-09-08 23:25:42 celerate.py:281 step:66K smpl:4M ep:26K epch:15.54 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 23:27:25 celerate.py:281 step:67K smpl:4M ep:26K epch:15.59 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.487 data_s:0.025 +INFO 2025-09-08 23:29:08 celerate.py:281 step:67K smpl:4M ep:26K epch:15.63 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.005 +INFO 2025-09-08 23:30:50 celerate.py:281 step:67K smpl:4M ep:27K epch:15.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.506 data_s:0.003 +INFO 2025-09-08 23:32:33 celerate.py:281 step:67K smpl:4M ep:27K epch:15.73 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 23:34:16 celerate.py:281 step:67K smpl:4M ep:27K epch:15.77 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.503 data_s:0.014 +INFO 2025-09-08 23:35:59 celerate.py:281 step:68K smpl:4M ep:27K epch:15.82 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 23:37:42 celerate.py:281 step:68K smpl:4M ep:27K epch:15.87 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 23:39:26 celerate.py:281 step:68K smpl:4M ep:27K epch:15.91 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 23:41:10 celerate.py:281 step:68K smpl:4M ep:27K epch:15.96 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 23:42:56 celerate.py:281 step:68K smpl:4M ep:27K epch:16.01 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.469 data_s:0.061 +INFO 2025-09-08 23:44:40 celerate.py:281 step:69K smpl:4M ep:27K epch:16.05 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.339 data_s:0.179 +INFO 2025-09-08 23:46:22 celerate.py:281 step:69K smpl:4M ep:27K epch:16.10 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: 
+0.386 data_s:0.124 +INFO 2025-09-08 23:48:06 celerate.py:281 step:69K smpl:4M ep:27K epch:16.15 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 23:49:50 celerate.py:281 step:69K smpl:4M ep:27K epch:16.20 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 23:51:33 celerate.py:281 step:69K smpl:4M ep:27K epch:16.24 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.501 data_s:0.014 +INFO 2025-09-08 23:53:17 celerate.py:281 step:70K smpl:4M ep:28K epch:16.29 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 23:54:59 celerate.py:281 step:70K smpl:4M ep:28K epch:16.34 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.507 data_s:0.005 +INFO 2025-09-08 23:56:43 celerate.py:281 step:70K smpl:4M ep:28K epch:16.38 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 23:58:27 celerate.py:281 step:70K smpl:4M ep:28K epch:16.43 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 00:00:12 celerate.py:281 step:70K smpl:5M ep:28K epch:16.48 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-09 00:01:55 celerate.py:281 step:71K smpl:5M ep:28K epch:16.52 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:03:37 celerate.py:281 step:71K smpl:5M ep:28K epch:16.57 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 00:05:20 celerate.py:281 step:71K smpl:5M ep:28K epch:16.62 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:07:04 celerate.py:281 step:71K smpl:5M ep:28K epch:16.66 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-09 00:08:47 celerate.py:281 step:71K smpl:5M ep:28K epch:16.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:10:30 celerate.py:281 step:72K smpl:5M ep:28K epch:16.76 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 00:12:13 celerate.py:281 
step:72K smpl:5M ep:28K epch:16.80 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:13:58 celerate.py:281 step:72K smpl:5M ep:29K epch:16.85 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.522 data_s:0.003 +INFO 2025-09-09 00:15:40 celerate.py:281 step:72K smpl:5M ep:29K epch:16.90 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.506 data_s:0.003 +INFO 2025-09-09 00:17:23 celerate.py:281 step:72K smpl:5M ep:29K epch:16.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.510 data_s:0.004 +INFO 2025-09-09 00:19:05 celerate.py:281 step:73K smpl:5M ep:29K epch:16.99 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.003 +INFO 2025-09-09 00:20:51 celerate.py:281 step:73K smpl:5M ep:29K epch:17.04 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.437 data_s:0.092 +INFO 2025-09-09 00:22:34 celerate.py:281 step:73K smpl:5M ep:29K epch:17.08 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.489 data_s:0.025 +INFO 2025-09-09 00:24:17 celerate.py:281 step:73K smpl:5M ep:29K epch:17.13 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 00:26:01 celerate.py:281 step:73K smpl:5M ep:29K epch:17.18 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 00:27:45 celerate.py:281 step:74K smpl:5M ep:29K epch:17.22 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 00:29:28 celerate.py:281 step:74K smpl:5M ep:29K epch:17.27 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:31:11 celerate.py:281 step:74K smpl:5M ep:29K epch:17.32 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:32:54 celerate.py:281 step:74K smpl:5M ep:29K epch:17.37 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:34:38 celerate.py:281 step:74K smpl:5M ep:29K epch:17.41 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 00:36:21 celerate.py:281 step:75K smpl:5M ep:30K epch:17.46 loss:0.086 grdn:0.241 lr:2.5e-06 
updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 00:38:05 celerate.py:281 step:75K smpl:5M ep:30K epch:17.51 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:39:49 celerate.py:281 step:75K smpl:5M ep:30K epch:17.55 loss:0.086 grdn:0.251 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 00:41:32 celerate.py:281 step:75K smpl:5M ep:30K epch:17.60 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.506 data_s:0.005 +INFO 2025-09-09 00:43:14 celerate.py:281 step:75K smpl:5M ep:30K epch:17.65 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.390 data_s:0.122 +INFO 2025-09-09 00:44:58 celerate.py:281 step:76K smpl:5M ep:30K epch:17.69 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.410 data_s:0.107 +INFO 2025-09-09 00:46:41 celerate.py:281 step:76K smpl:5M ep:30K epch:17.74 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.427 data_s:0.085 +INFO 2025-09-09 00:48:24 celerate.py:281 step:76K smpl:5M ep:30K epch:17.79 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: +0.492 data_s:0.025 +INFO 2025-09-09 00:50:08 celerate.py:281 step:76K smpl:5M ep:30K epch:17.83 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 00:51:51 celerate.py:281 step:76K smpl:5M ep:30K epch:17.88 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:53:33 celerate.py:281 step:77K smpl:5M ep:30K epch:17.93 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:55:18 celerate.py:281 step:77K smpl:5M ep:30K epch:17.97 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-09 00:57:05 celerate.py:281 step:77K smpl:5M ep:31K epch:18.02 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.486 data_s:0.046 +INFO 2025-09-09 00:58:47 celerate.py:281 step:77K smpl:5M ep:31K epch:18.07 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 01:00:31 celerate.py:281 step:77K smpl:5M ep:31K epch:18.11 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:02:15 
celerate.py:281 step:78K smpl:5M ep:31K epch:18.16 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:03:56 celerate.py:281 step:78K smpl:5M ep:31K epch:18.21 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.498 data_s:0.008 +INFO 2025-09-09 01:05:40 celerate.py:281 step:78K smpl:5M ep:31K epch:18.25 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.508 data_s:0.010 +INFO 2025-09-09 01:07:24 celerate.py:281 step:78K smpl:5M ep:31K epch:18.30 loss:0.085 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:09:07 celerate.py:281 step:78K smpl:5M ep:31K epch:18.35 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:10:51 celerate.py:281 step:79K smpl:5M ep:31K epch:18.40 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:12:35 celerate.py:281 step:79K smpl:5M ep:31K epch:18.44 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:14:17 celerate.py:281 step:79K smpl:5M ep:31K epch:18.49 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 01:16:01 celerate.py:281 step:79K smpl:5M ep:31K epch:18.54 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:17:43 celerate.py:281 step:79K smpl:5M ep:31K epch:18.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.507 data_s:0.003 +INFO 2025-09-09 01:19:26 celerate.py:281 step:80K smpl:5M ep:32K epch:18.63 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:21:09 celerate.py:281 step:80K smpl:5M ep:32K epch:18.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:22:52 celerate.py:281 step:80K smpl:5M ep:32K epch:18.72 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.501 data_s:0.011 +INFO 2025-09-09 01:22:52 celerate.py:295 Checkpoint policy after step 80000 +INFO 2025-09-09 01:24:37 celerate.py:281 step:80K smpl:5M ep:32K epch:18.77 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 
01:26:20 celerate.py:281 step:80K smpl:5M ep:32K epch:18.82 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:28:05 celerate.py:281 step:81K smpl:5M ep:32K epch:18.86 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 01:29:48 celerate.py:281 step:81K smpl:5M ep:32K epch:18.91 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:31:32 celerate.py:281 step:81K smpl:5M ep:32K epch:18.96 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 01:33:18 celerate.py:281 step:81K smpl:5M ep:32K epch:19.00 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.487 data_s:0.042 +INFO 2025-09-09 01:35:01 celerate.py:281 step:81K smpl:5M ep:32K epch:19.05 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.461 data_s:0.054 +INFO 2025-09-09 01:36:46 celerate.py:281 step:82K smpl:5M ep:32K epch:19.10 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.450 data_s:0.071 +INFO 2025-09-09 01:38:29 celerate.py:281 step:82K smpl:5M ep:32K epch:19.14 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.451 data_s:0.064 +INFO 2025-09-09 01:40:12 celerate.py:281 step:82K smpl:5M ep:32K epch:19.19 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 01:41:54 celerate.py:281 step:82K smpl:5M ep:33K epch:19.24 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: +0.480 data_s:0.030 +INFO 2025-09-09 01:43:37 celerate.py:281 step:82K smpl:5M ep:33K epch:19.28 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.434 data_s:0.080 +INFO 2025-09-09 01:45:20 celerate.py:281 step:83K smpl:5M ep:33K epch:19.33 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:47:04 celerate.py:281 step:83K smpl:5M ep:33K epch:19.38 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:48:49 celerate.py:281 step:83K smpl:5M ep:33K epch:19.42 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.520 data_s:0.003 +INFO 2025-09-09 01:50:32 celerate.py:281 step:83K smpl:5M ep:33K epch:19.47 
loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:52:15 celerate.py:281 step:83K smpl:5M ep:33K epch:19.52 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:53:59 celerate.py:281 step:84K smpl:5M ep:33K epch:19.57 loss:0.085 grdn:0.236 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:55:41 celerate.py:281 step:84K smpl:5M ep:33K epch:19.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 01:57:24 celerate.py:281 step:84K smpl:5M ep:33K epch:19.66 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.500 data_s:0.013 +INFO 2025-09-09 01:59:07 celerate.py:281 step:84K smpl:5M ep:33K epch:19.71 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 02:00:50 celerate.py:281 step:84K smpl:5M ep:33K epch:19.75 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 02:02:34 celerate.py:281 step:85K smpl:5M ep:34K epch:19.80 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-09 02:04:17 celerate.py:281 step:85K smpl:5M ep:34K epch:19.85 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 02:05:59 celerate.py:281 step:85K smpl:5M ep:34K epch:19.89 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.507 data_s:0.004 +INFO 2025-09-09 02:07:43 celerate.py:281 step:85K smpl:5M ep:34K epch:19.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.011 +INFO 2025-09-09 02:09:26 celerate.py:281 step:85K smpl:5M ep:34K epch:19.99 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.392 data_s:0.124 +INFO 2025-09-09 02:11:14 celerate.py:281 step:86K smpl:5M ep:34K epch:20.03 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.435 data_s:0.100 +INFO 2025-09-09 02:12:57 celerate.py:281 step:86K smpl:5M ep:34K epch:20.08 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 02:14:41 celerate.py:281 step:86K smpl:6M ep:34K epch:20.13 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 
2025-09-09 02:16:25 celerate.py:281 step:86K smpl:6M ep:34K epch:20.17 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 02:18:09 celerate.py:281 step:86K smpl:6M ep:34K epch:20.22 loss:0.085 grdn:0.238 lr:2.5e-06 updt_s: +0.506 data_s:0.010 +INFO 2025-09-09 02:19:53 celerate.py:281 step:87K smpl:6M ep:34K epch:20.27 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 02:21:35 celerate.py:281 step:87K smpl:6M ep:34K epch:20.31 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.501 data_s:0.007 +INFO 2025-09-09 02:23:17 celerate.py:281 step:87K smpl:6M ep:34K epch:20.36 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: +0.432 data_s:0.080 +INFO 2025-09-09 02:25:01 celerate.py:281 step:87K smpl:6M ep:35K epch:20.41 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.445 data_s:0.073 +INFO 2025-09-09 02:26:43 celerate.py:281 step:87K smpl:6M ep:35K epch:20.45 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.401 data_s:0.107 +INFO 2025-09-09 02:28:26 celerate.py:281 step:88K smpl:6M ep:35K epch:20.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.502 data_s:0.009 +INFO 2025-09-09 02:30:09 celerate.py:281 step:88K smpl:6M ep:35K epch:20.55 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: +0.491 data_s:0.021 +INFO 2025-09-09 02:31:52 celerate.py:281 step:88K smpl:6M ep:35K epch:20.59 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.391 data_s:0.124 +INFO 2025-09-09 02:33:35 celerate.py:281 step:88K smpl:6M ep:35K epch:20.64 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.332 data_s:0.180 +INFO 2025-09-09 02:35:18 celerate.py:281 step:88K smpl:6M ep:35K epch:20.69 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: +0.333 data_s:0.181 +INFO 2025-09-09 02:37:02 celerate.py:281 step:89K smpl:6M ep:35K epch:20.74 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: +0.332 data_s:0.185 +INFO 2025-09-09 02:38:47 celerate.py:281 step:89K smpl:6M ep:35K epch:20.78 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.332 data_s:0.190 +INFO 2025-09-09 02:40:30 celerate.py:281 step:89K smpl:6M ep:35K 
epch:20.83 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.330 data_s:0.187 +INFO 2025-09-09 02:42:12 celerate.py:281 step:89K smpl:6M ep:35K epch:20.88 loss:0.085 grdn:0.243 lr:2.5e-06 updt_s: +0.331 data_s:0.177 +INFO 2025-09-09 02:43:56 celerate.py:281 step:89K smpl:6M ep:35K epch:20.92 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.330 data_s:0.185 +INFO 2025-09-09 02:45:39 celerate.py:281 step:90K smpl:6M ep:36K epch:20.97 loss:0.087 grdn:0.252 lr:2.5e-06 updt_s: +0.335 data_s:0.182 +INFO 2025-09-09 02:47:25 celerate.py:281 step:90K smpl:6M ep:36K epch:21.02 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.330 data_s:0.197 +INFO 2025-09-09 02:49:08 celerate.py:281 step:90K smpl:6M ep:36K epch:21.06 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.330 data_s:0.182 +INFO 2025-09-09 02:50:51 celerate.py:281 step:90K smpl:6M ep:36K epch:21.11 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.334 data_s:0.181 +INFO 2025-09-09 02:52:34 celerate.py:281 step:90K smpl:6M ep:36K epch:21.16 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.330 data_s:0.183 +INFO 2025-09-09 02:54:17 celerate.py:281 step:91K smpl:6M ep:36K epch:21.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.342 data_s:0.173 +INFO 2025-09-09 02:56:01 celerate.py:281 step:91K smpl:6M ep:36K epch:21.25 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.331 data_s:0.189 +INFO 2025-09-09 02:57:44 celerate.py:281 step:91K smpl:6M ep:36K epch:21.30 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.329 data_s:0.182 +INFO 2025-09-09 02:59:27 celerate.py:281 step:91K smpl:6M ep:36K epch:21.34 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.341 data_s:0.175 +INFO 2025-09-09 03:01:11 celerate.py:281 step:91K smpl:6M ep:36K epch:21.39 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.330 data_s:0.188 +INFO 2025-09-09 03:02:54 celerate.py:281 step:92K smpl:6M ep:36K epch:21.44 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.328 data_s:0.186 +INFO 2025-09-09 03:04:39 celerate.py:281 step:92K smpl:6M ep:36K epch:21.48 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.329 
data_s:0.195 +INFO 2025-09-09 03:06:22 celerate.py:281 step:92K smpl:6M ep:36K epch:21.53 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.330 data_s:0.183 +INFO 2025-09-09 03:08:04 celerate.py:281 step:92K smpl:6M ep:37K epch:21.58 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.462 data_s:0.051 +INFO 2025-09-09 03:09:48 celerate.py:281 step:92K smpl:6M ep:37K epch:21.62 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: +0.407 data_s:0.108 +INFO 2025-09-09 03:11:32 celerate.py:281 step:93K smpl:6M ep:37K epch:21.67 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.333 data_s:0.185 +INFO 2025-09-09 03:13:15 celerate.py:281 step:93K smpl:6M ep:37K epch:21.72 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 03:14:58 celerate.py:281 step:93K smpl:6M ep:37K epch:21.77 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.357 data_s:0.156 +INFO 2025-09-09 03:16:41 celerate.py:281 step:93K smpl:6M ep:37K epch:21.81 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.487 data_s:0.027 +INFO 2025-09-09 03:18:25 celerate.py:281 step:93K smpl:6M ep:37K epch:21.86 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 03:20:08 celerate.py:281 step:94K smpl:6M ep:37K epch:21.91 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 03:21:51 celerate.py:281 step:94K smpl:6M ep:37K epch:21.95 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.508 data_s:0.004 +INFO 2025-09-09 03:23:38 celerate.py:281 step:94K smpl:6M ep:37K epch:22.00 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.429 data_s:0.104 +INFO 2025-09-09 03:25:20 celerate.py:281 step:94K smpl:6M ep:37K epch:22.05 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.328 data_s:0.183 +INFO 2025-09-09 03:27:04 celerate.py:281 step:94K smpl:6M ep:37K epch:22.09 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.329 data_s:0.191 +INFO 2025-09-09 03:28:47 celerate.py:281 step:95K smpl:6M ep:37K epch:22.14 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.329 data_s:0.186 +INFO 2025-09-09 03:30:30 celerate.py:281 step:95K 
smpl:6M ep:38K epch:22.19 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.346 data_s:0.166 +INFO 2025-09-09 03:32:13 celerate.py:281 step:95K smpl:6M ep:38K epch:22.23 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.334 data_s:0.181 +INFO 2025-09-09 03:33:56 celerate.py:281 step:95K smpl:6M ep:38K epch:22.28 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.402 data_s:0.110 +INFO 2025-09-09 03:35:39 celerate.py:281 step:95K smpl:6M ep:38K epch:22.33 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: +0.353 data_s:0.161 +INFO 2025-09-09 03:37:22 celerate.py:281 step:96K smpl:6M ep:38K epch:22.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.356 data_s:0.158 +INFO 2025-09-09 03:39:04 celerate.py:281 step:96K smpl:6M ep:38K epch:22.42 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.379 data_s:0.131 +INFO 2025-09-09 03:40:49 celerate.py:281 step:96K smpl:6M ep:38K epch:22.47 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.344 data_s:0.175 +INFO 2025-09-09 03:42:32 celerate.py:281 step:96K smpl:6M ep:38K epch:22.51 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.331 data_s:0.185 +INFO 2025-09-09 03:44:15 celerate.py:281 step:96K smpl:6M ep:38K epch:22.56 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.331 data_s:0.183 +INFO 2025-09-09 03:45:58 celerate.py:281 step:97K smpl:6M ep:38K epch:22.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.330 data_s:0.184 +INFO 2025-09-09 03:47:43 celerate.py:281 step:97K smpl:6M ep:38K epch:22.65 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: +0.331 data_s:0.188 +INFO 2025-09-09 03:49:27 celerate.py:281 step:97K smpl:6M ep:38K epch:22.70 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.333 data_s:0.185 +INFO 2025-09-09 03:51:10 celerate.py:281 step:97K smpl:6M ep:39K epch:22.75 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: +0.330 data_s:0.185 +INFO 2025-09-09 03:52:54 celerate.py:281 step:97K smpl:6M ep:39K epch:22.79 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.330 data_s:0.192 +INFO 2025-09-09 03:54:37 celerate.py:281 step:98K smpl:6M ep:39K epch:22.84 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: 
+0.329 data_s:0.185 +INFO 2025-09-09 03:56:21 celerate.py:281 step:98K smpl:6M ep:39K epch:22.89 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 03:58:04 celerate.py:281 step:98K smpl:6M ep:39K epch:22.94 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.329 data_s:0.185 +INFO 2025-09-09 03:59:46 celerate.py:281 step:98K smpl:6M ep:39K epch:22.98 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.329 data_s:0.183 +INFO 2025-09-09 04:01:32 celerate.py:281 step:98K smpl:6M ep:39K epch:23.03 loss:0.087 grdn:0.250 lr:2.5e-06 updt_s: +0.376 data_s:0.151 +INFO 2025-09-09 04:03:16 celerate.py:281 step:99K smpl:6M ep:39K epch:23.08 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 04:04:59 celerate.py:281 step:99K smpl:6M ep:39K epch:23.12 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.379 data_s:0.136 +INFO 2025-09-09 04:06:42 celerate.py:281 step:99K smpl:6M ep:39K epch:23.17 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 04:08:25 celerate.py:281 step:99K smpl:6M ep:39K epch:23.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 04:10:07 celerate.py:281 step:99K smpl:6M ep:39K epch:23.26 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.470 data_s:0.039 +INFO 2025-09-09 04:11:50 celerate.py:281 step:100K smpl:6M ep:39K epch:23.31 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s +:0.347 data_s:0.169 +INFO 2025-09-09 04:13:34 celerate.py:281 step:100K smpl:6M ep:40K epch:23.36 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s +:0.330 data_s:0.185 +INFO 2025-09-09 04:15:17 celerate.py:281 step:100K smpl:6M ep:40K epch:23.40 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s +:0.356 data_s:0.156 +INFO 2025-09-09 04:15:17 celerate.py:295 Checkpoint policy after step 100000 +INFO 2025-09-09 04:15:18 celerate.py:359 End of training +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ tmux capture-pane -pS - > tmux_log.txt + + + + + + + + + +