diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh index 36f8c6473..ad6ca0f13 100644 --- a/examples/6_evaluate_libero.sh +++ b/examples/6_evaluate_libero.sh @@ -1,14 +1,45 @@ #!/bin/bash -unset LEROBOT_HOME -unset HF_LEROBOT_HOME +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export CUDA_VISIBLE_DEVICES=3 + # CONFIGURATION -POLICY_PATH="bicmol/smolvla-libero" +POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" +POLICY_PATH="/raid/jade/models/smolvlamust" TASK=libero_spatial ENV_TYPE="libero" -BATCH_SIZE=1 -N_EPISODES=1 +BATCH_SIZE=10 +N_EPISODES=10 +# storage / caches +RAID=/raid/jade +N_ACTION_STEPS=1 +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false export MUJOCO_GL=egl +export MUJOCO_GL=egl +unset HF_HUB_OFFLINE # RUN EVALUATION python src/lerobot/scripts/eval.py \ --policy.path="$POLICY_PATH" \ @@ -17,3 +48,11 @@ python src/lerobot/scripts/eval.py \ --eval.n_episodes="$N_EPISODES" \ --env.multitask_eval=False \ --env.task=$TASK \ +# python examples/evaluate_libero.py \ +# --policy_path "$POLICY_PATH" \ +# --task_suite_name "$TASK" \ 
+# --num_steps_wait 10 \ +# --num_trials_per_task 10 \ +# --video_out_path "data/libero/videos" \ +# --device "cuda" \ +# --seed 7 \ No newline at end of file diff --git a/examples/6_evaluate_libero_2.sh b/examples/6_evaluate_libero_2.sh new file mode 100644 index 000000000..9d05c0330 --- /dev/null +++ b/examples/6_evaluate_libero_2.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export CUDA_VISIBLE_DEVICES=3 + +# CONFIGURATION +POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" +POLICY_PATH="AustineJohnBreaker/smolvla_stratch_libero_spatial" +TASK=libero_spatial +ENV_TYPE="libero" +BATCH_SIZE=10 +N_EPISODES=10 +USE_AMP=false +N_ACTION_STEPS=1 +SELF_ATTN_EVERY_N_LAYERS=2 +VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct +PAD_LANG_TO=longest +LOAD_VLM_WEIGHTS=true +NUM_VLM_LAYERS=16 +CHUNK_SIZE=50 +N_OBS_STEPS=1 +NUM_EXPERT_LAYERS=0 +EXPERT_WIDTH_MULTIPLIER=0.5 + + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export MUJOCO_GL=egl +ADD_IMAGE_TOKENS=true +unset 
HF_HUB_OFFLINE +# RUN EVALUATION +python src/lerobot/scripts/eval.py \ + --policy.path="$POLICY_PATH" \ + --env.type="$ENV_TYPE" \ + --eval.batch_size="$BATCH_SIZE" \ + --eval.n_episodes="$N_EPISODES" \ + --env.multitask_eval=False \ + --env.task=$TASK \ + --policy.use_amp=$USE_AMP \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.attention_mode=$ATTN_MODE \ + --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --policy.vlm_model_name=$VLM_NAME \ + --policy.pad_language_to=$PAD_LANG_TO \ + --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ + --policy.num_vlm_layers=$NUM_VLM_LAYERS \ + --policy.chunk_size=$CHUNK_SIZE \ + --policy.n_obs_steps=$N_OBS_STEPS \ + --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ + --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ + # --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ diff --git a/examples/7_train_acc.sh b/examples/7_train_acc.sh new file mode 100644 index 000000000..27f445143 --- /dev/null +++ b/examples/7_train_acc.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# smolvla training with accelerate + +set -euo pipefail + +# repo/env +cd ~/lerobot || exit 1 +# conda activate lerobot +export LC_ALL=C + +rm -f core-* + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl + +# CONFIG +ENV=libero +TASK=libero_spatial +REPO_ID=physical-intelligence/libero + +POLICY=smolvla +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct + +# Optim / scheduling +LR=1e-4 +DECAY_LR=2.5e-6 +DECAY_STEPS=30000 +USE_AMP=true # set to true for mixed precision +TRAIN_EXPERT_ONLY=true +N_ACTION_STEPS=1 +SEED=1000 + 
+# Training loop +OFFLINE_STEPS=100000 +BATCH_SIZE=32 +EVAL_FREQ=0 +SAVE_FREQ=20000 +EVAL_BATCH_SIZE=1 +NUM_EPISODES=1 + +# number of gpus to use +NUM_PROCESSES=2 +export CUDA_VISIBLE_DEVICES=1,3 +PORT=29522 + +# naming/output dir +TRAIN_DIR=$RAID/logs/lerobot/lerobot_2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +echo "Training dir: $TRAIN_DIR" + +rm -rf "$TRAIN_DIR" + +# RUN +python -m accelerate.commands.launch \ + --num_processes $NUM_PROCESSES \ + --num_machines 1 \ + --main_process_port $PORT \ + --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ + src/lerobot/scripts/train_accelerate.py \ + --policy.type=$POLICY \ + --policy.use_amp=True \ + --policy.vlm_model_name=$VLM \ + --dataset.repo_id=$REPO_ID \ + --dataset.root=$HF_DATASETS_CACHE \ + --env.type=$ENV \ + --env.task=$TASK \ + --output_dir=$TRAIN_DIR \ + --batch_size=$BATCH_SIZE \ + --steps=$OFFLINE_STEPS \ + --eval_freq=$EVAL_FREQ \ + --save_freq=$SAVE_FREQ \ + --eval.batch_size=$EVAL_BATCH_SIZE \ + --eval.n_episodes=$NUM_EPISODES \ + --policy.optimizer_lr=$LR \ + --policy.repo_id=None \ + --policy.scheduler_decay_lr=$DECAY_LR \ + --policy.scheduler_decay_steps=$DECAY_STEPS \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ + --policy.vlm_model_name=$VLM \ + --seed=$SEED \ + --wandb.enable=false diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh index 3943e3c96..f0b9de4e5 100644 --- a/examples/7_train_libero_smolvla.sh +++ b/examples/7_train_libero_smolvla.sh @@ -21,8 +21,8 @@ export WANDB_CACHE_DIR=$RAID/.cache/wandb export TMPDIR=$RAID/.cache/tmp mkdir -p $TMPDIR export WANDB_MODE=offline -export HF_DATASETS_OFFLINE=1 -export HF_HUB_OFFLINE=1 +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 export TOKENIZERS_PARALLELISM=false export MUJOCO_GL=egl @@ -31,11 +31,11 @@ PORT=29522 # =================== CONFIG =================== ENV=libero -TASK=libero_spatial 
+TASK=libero_object REPO_ID=physical-intelligence/libero - +ROOT=$RAID POLICY=smolvla -VLM=HuggingFaceTB/SmolVLM2-2.2B-Instruct +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct # Optim / scheduling LR=1e-4 @@ -55,10 +55,10 @@ EVAL_BATCH_SIZE=1 NUM_EPISODES=1 # GPU selection 0, 1, 2, 3 -export CUDA_VISIBLE_DEVICES=1 +export CUDA_VISIBLE_DEVICES=0 # naming/output dir -TRAIN_DIR=$RAID/logs/lerobot/lerobot_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +TRAIN_DIR=$RAID/logs/lerobot/lerobot_solo_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} echo "Training dir: $TRAIN_DIR" # train @@ -68,7 +68,6 @@ python src/lerobot/scripts/train.py \ --policy.type=$POLICY \ --policy.vlm_model_name=$VLM \ --dataset.repo_id=$REPO_ID \ - --dataset.root=$HF_DATASETS_CACHE \ --env.type=$ENV \ --env.task=$TASK \ --output_dir=$TRAIN_DIR \ @@ -85,6 +84,6 @@ python src/lerobot/scripts/train.py \ --policy.scheduler_decay_steps=$DECAY_STEPS \ --policy.n_action_steps=$N_ACTION_STEPS \ --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ - --policy.vlm_model_name=/raid/jade/.cache/huggingface/models/SmolVLM2-2.2B-Instruct \ + --policy.vlm_model_name=$VLM \ --seed=$SEED \ --wandb.enable=false diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh new file mode 100644 index 000000000..98029f3ad --- /dev/null +++ b/examples/8_train_smolvla_must.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# smolvla training with accelerate + +set -euo pipefail + +# repo/env +cd ~/lerobot || exit 1 +# conda activate lerobot +export LC_ALL=C + +rm -f core-* + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# 
export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl + +# CONFIG +ENV=libero +TASK=libero_spatial +REPO_ID=HuggingFaceVLA/libero + +POLICY=smolvla +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct + +# Optim / scheduling +LR=1e-4 +DECAY_LR=2.5e-6 +DECAY_STEPS=30000 +USE_AMP=true # set to true for mixed precision +TRAIN_EXPERT_ONLY=true +N_ACTION_STEPS=1 +SEED=1000 +LOAD_VLM_WEIGHTS=true +# Training loop +OFFLINE_STEPS=100000 +BATCH_SIZE=32 +EVAL_FREQ=0 +SAVE_FREQ=20000 +EVAL_BATCH_SIZE=1 +NUM_EPISODES=1 +ADD_IMAGE_TOKENS=true +N_OBS_STEPS=1 +ATTN_MODE=cross_attn +EXPERT_WIDTH_MULTIPLIER=0.5 +# number of gpus to use +NUM_PROCESSES=2 +NUM_VLM_LAYERS=0 +SELF_ATTN_EVERY_N_LAYERS=2 +CHUNK_SIZE=50 +export CUDA_VISIBLE_DEVICES=0 +PORT=29522 +PREFIX_LENGTH=0 +LOAD_VLM_WEIGHTS=true +# naming/output dir +TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +echo "Training dir: $TRAIN_DIR" + +rm -rf "$TRAIN_DIR" + +# RUN +# python -m accelerate.commands.launch \ +# --num_processes $NUM_PROCESSES \ +# --num_machines 1 \ +# --main_process_port $PORT \ +# --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ +# src/lerobot/scripts/train_accelerate.py \ +# --policy.type=$POLICY \ +# --policy.use_amp=True \ +# --policy.vlm_model_name=$VLM \ +# --dataset.repo_id=$REPO_ID \ +# --dataset.root=$HF_DATASETS_CACHE \ +# --env.type=$ENV \ +# --env.task=$TASK \ +# --output_dir=$TRAIN_DIR \ +# --batch_size=$BATCH_SIZE \ +# --steps=$OFFLINE_STEPS \ +# --eval_freq=$EVAL_FREQ \ +# --save_freq=$SAVE_FREQ \ +# --eval.batch_size=$EVAL_BATCH_SIZE \ +# --eval.n_episodes=$NUM_EPISODES \ +# --policy.optimizer_lr=$LR \ +# --policy.repo_id=None \ +# --policy.scheduler_decay_lr=$DECAY_LR \ +# --policy.scheduler_decay_steps=$DECAY_STEPS \ +# --policy.n_action_steps=$N_ACTION_STEPS \ +# --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ +# --policy.vlm_model_name=$VLM \ +# --policy.n_obs_steps=$N_OBS_STEPS \
+# --policy.attention_mode=$ATTN_MODE \ +# --policy.prefix_length=$PREFIX_LENGTH \ +# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ +# --policy.chunk_size=$CHUNK_SIZE \ +# --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ +# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ +# --seed=$SEED \ +# --wandb.enable=false + + +python src/lerobot/scripts/train.py \ + --policy.type=$POLICY \ + --policy.use_amp=False \ + --policy.vlm_model_name=$VLM \ + --dataset.repo_id=$REPO_ID \ + --dataset.root='/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data' \ + --env.type=$ENV \ + --env.task=$TASK \ + --output_dir=$TRAIN_DIR \ + --batch_size=$BATCH_SIZE \ + --steps=$OFFLINE_STEPS \ + --eval_freq=$EVAL_FREQ \ + --save_freq=$SAVE_FREQ \ + --eval.batch_size=$EVAL_BATCH_SIZE \ + --eval.n_episodes=$NUM_EPISODES \ + --policy.optimizer_lr=$LR \ + --policy.repo_id=None \ + --policy.scheduler_decay_lr=$DECAY_LR \ + --policy.scheduler_decay_steps=$DECAY_STEPS \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ + --policy.vlm_model_name=$VLM \ + --policy.n_obs_steps=$N_OBS_STEPS \ + --policy.attention_mode=$ATTN_MODE \ + --policy.prefix_length=$PREFIX_LENGTH \ + --policy.num_vlm_layers=$NUM_VLM_LAYERS \ + --policy.chunk_size=$CHUNK_SIZE \ + --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ + --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ + --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --seed=$SEED \ + --wandb.enable=false diff --git a/examples/checker.py b/examples/checker.py new file mode 100644 index 000000000..12377e9b9 --- /dev/null +++ b/examples/checker.py @@ -0,0 +1,27 @@ +from huggingface_hub import HfApi +api = HfApi() +# api.upload_large_folder( +# repo_id="HuggingFaceVLA/libero", +# repo_type="dataset", +# folder_path="/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero", +# ) +api.upload_large_folder( + repo_id="HuggingFaceVLA/metaworld_mt50", + repo_type="dataset", + 
folder_path="/raid/jade/.cache/huggingface/lerobot/metaworld_mt50", +) +# repo_id="HuggingFaceVLA/libero" +# # Upload extra files +# api.upload_file( +# repo_id=repo_id, +# repo_type="dataset", +# path_or_fileobj="/raid/jade/libero_converted/README.md", +# path_in_repo="README.md" +# ) + +# api.upload_folder( +# repo_id=repo_id, +# repo_type="dataset", +# folder_path="/raid/jade/libero_converted/meta", +# path_in_repo="meta" +# ) diff --git a/examples/checker2.py b/examples/checker2.py new file mode 100644 index 000000000..a5825d87f --- /dev/null +++ b/examples/checker2.py @@ -0,0 +1,35 @@ +import pyarrow.parquet as pq + +# # First parquet (cached HF version) +meta1 = pq.read_metadata("/raid/jade/.cache/huggingface/datasets/data/chunk-000/episode_000000.parquet") +meta1 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") +print("First parquet key_value_metadata:") +print(meta1.metadata) # low-level file metadata +# print() +print("Second") +# Second parquet (your converted version) +meta2 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") +print("\nSecond parquet key_value_metadata:") +# print(meta2.metadata) + +# from datasets import load_dataset +# root_dir = "/raid/jade/libero_converted" + +# # Load all parquet files under the root_dir recursively +# ds = load_dataset("parquet", data_files=f"{root_dir}/**/*.parquet") + +# print(ds) # prints split info +# print(ds["train"].features) # check schema/features + +# # Peek at one row +# example = ds["train"][0] +# print(example.keys()) +# print(type(example["observation.images.image"])) +# print(type(example["observation.images.image2"])) + +import pyarrow.parquet as pq + +for ep in ["episode_000019.parquet", "episode_000021.parquet", "episode_000026.parquet"]: + path = f"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/{ep}" + schema = 
pq.read_schema(path) + print(ep, schema.names) diff --git a/examples/convert_data.py b/examples/convert_data.py new file mode 100644 index 000000000..96ce58cb1 --- /dev/null +++ b/examples/convert_data.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +""" +Convert local LeRobot datasets from v2.0 to v2.1 format. +This script adapts the official converter to work with local datasets. +""" + +import sys +import argparse +import logging +from pathlib import Path + +# Add lerobot to path +sys.path.insert(0, '/home/jade_choghari/lerobot/src') + +from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset +from lerobot.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info +from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def convert_local_dataset( + dataset_path: str, + num_workers: int = 4, + skip_if_converted: bool = True +): + """ + Convert a local dataset from v2.0 to v2.1 format. 
+ + Args: + dataset_path: Path to the local dataset directory + num_workers: Number of workers for parallel processing + skip_if_converted: Skip if already has episodes_stats.jsonl + """ + dataset_path = Path(dataset_path) + + print(f"šŸ”„ Converting local dataset: {dataset_path}") + + # Check if already converted + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + if episodes_stats_path.exists() and skip_if_converted: + # Check if file is empty + file_size = episodes_stats_path.stat().st_size + if file_size == 0: + print(f" āš ļø episodes_stats.jsonl is empty, will regenerate") + else: + # Check if file has content + with open(episodes_stats_path, 'r') as f: + content = f.read().strip() + if not content: + print(f" āš ļø episodes_stats.jsonl has no content, will regenerate") + else: + print(f" ā­ļø Already has episodes_stats.jsonl, skipping") + return True + + try: + # Check if this is a v2.0 dataset that needs conversion + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + stats_path = dataset_path / "meta" / "stats.json" + + if not episodes_stats_path.exists() and stats_path.exists(): + print(f" šŸ”„ Detected v2.0 dataset, creating temporary episodes_stats.jsonl...") + # Create empty episodes_stats.jsonl to allow loading + episodes_stats_path.touch() + created_temp_file = True + else: + created_temp_file = False + + # Load dataset from local path with pyav video backend + print(f" šŸ“‚ Loading dataset from local path...") + # Use a dummy repo_id since we're loading locally + dummy_repo_id = f"{dataset_path.parent.name}/{dataset_path.name}" + dataset = LeRobotDataset( + dummy_repo_id, + root=str(dataset_path), + # video_backend="pyav", + # local_files_only=True + ) + + # Remove temporary file if we created it + if created_temp_file and episodes_stats_path.exists() and episodes_stats_path.stat().st_size == 0: + episodes_stats_path.unlink() + print(f" šŸ—‘ļø Removed temporary episodes_stats.jsonl") + + # Remove existing 
episodes_stats if present (ensure clean conversion) + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + if episodes_stats_path.exists(): + episodes_stats_path.unlink() + print(f" šŸ—‘ļø Removed existing episodes_stats.jsonl") + + # Check if video directory exists before conversion + videos_dir = dataset_path / "videos" + if not videos_dir.exists(): + print(f" āš ļø No videos directory found - will skip video statistics") + + # Convert stats + print(f" šŸ“Š Computing episode statistics...") + convert_stats(dataset, num_workers=num_workers) + + # Load reference stats for validation if they exist + stats_path = dataset.root / STATS_PATH + if stats_path.exists(): + print(f" āœ… Validating against reference stats...") + try: + ref_stats = load_stats(dataset.root) + check_aggregate_stats(dataset, ref_stats) + print(f" āœ… Stats validation passed!") + except AssertionError as e: + print(f" āš ļø Stats validation failed with minor differences: {e}") + print(f" āš ļø This is likely due to floating-point precision, continuing anyway...") + # Check if the error is just a small numerical difference + if "Max absolute difference:" in str(e) and "Max relative difference:" in str(e): + print(f" āœ… Treating as acceptable numerical precision difference") + else: + raise e + + # Remove old stats.json file + print(f" šŸ—‘ļø Removing old stats.json") + stats_path.unlink() + else: + print(f" āš ļø No reference stats found, skipping validation") + + # Update codebase version + dataset.meta.info["codebase_version"] = CODEBASE_VERSION + write_info(dataset.meta.info, dataset.root) + + print(f" āœ… Successfully converted to v2.1") + return True + + except Exception as e: + print(f" āŒ Failed to convert: {e}") + logger.exception("Conversion failed") + return False + +def convert_multiple_datasets( + base_dirs: list[str], + max_datasets: int = None, + num_workers: int = 4 +): + """Convert multiple datasets from base directories.""" + + datasets_to_convert = [] + + 
# Scan for datasets needing conversion + for base_dir in base_dirs: + base_path = Path(base_dir) + if not base_path.exists(): + print(f"āš ļø Directory not found: {base_dir}") + continue + + print(f"šŸ” Scanning: {base_dir}") + + # Walk through author/dataset structure + for author_dir in sorted(base_path.iterdir()): + if not author_dir.is_dir(): + continue + + for dataset_dir in sorted(author_dir.iterdir()): + if not dataset_dir.is_dir(): + continue + + # Check if needs conversion + episodes_stats_path = dataset_dir / "meta" / "episodes_stats.jsonl" + info_path = dataset_dir / "meta" / "info.json" + + needs_conversion = False + if info_path.exists(): + if not episodes_stats_path.exists(): + needs_conversion = True + print(f" šŸ“ Found (missing): {author_dir.name}/{dataset_dir.name}") + else: + # Check if episodes_stats file is empty + try: + file_size = episodes_stats_path.stat().st_size + if file_size == 0: + needs_conversion = True + print(f" šŸ“ Found (empty): {author_dir.name}/{dataset_dir.name}") + else: + # Check if file has content + with open(episodes_stats_path, 'r') as f: + content = f.read().strip() + if not content: + needs_conversion = True + print(f" šŸ“ Found (no content): {author_dir.name}/{dataset_dir.name}") + except Exception as e: + # If we can't read the file, consider it needs conversion + needs_conversion = True + print(f" šŸ“ Found (read error): {author_dir.name}/{dataset_dir.name}") + + if needs_conversion: + datasets_to_convert.append(dataset_dir) + + if not datasets_to_convert: + print("šŸŽ‰ No datasets need conversion!") + return + + if max_datasets: + datasets_to_convert = datasets_to_convert[:max_datasets] + + print(f"\nšŸš€ Converting {len(datasets_to_convert)} datasets...") + + successful = 0 + failed = 0 + + for i, dataset_path in enumerate(datasets_to_convert, 1): + print(f"\n[{i}/{len(datasets_to_convert)}] {dataset_path.parent.name}/{dataset_path.name}") + + success = convert_local_dataset(dataset_path, 
num_workers=num_workers) + if success: + successful += 1 + else: + failed += 1 + + print(f"\nšŸ“Š Conversion Summary:") + print(f" āœ… Successful: {successful}") + print(f" āŒ Failed: {failed}") + print(f" šŸ“ˆ Success rate: {successful}/{len(datasets_to_convert)} ({100*successful/len(datasets_to_convert):.1f}%)") + + +def main(): + parser = argparse.ArgumentParser(description="Convert local LeRobot datasets to v2.1 format") + parser.add_argument("--dataset", type=str, help="Single dataset path to convert") + parser.add_argument("--base-dirs", nargs="+", + default=["/fsx/dana_aubakirova/vla/community_dataset_v1"], + help="Base directories to scan for datasets") + parser.add_argument("--max-datasets", type=int, help="Maximum number of datasets to convert") + parser.add_argument("--num-workers", type=int, default=4, help="Number of workers for stats computation") + parser.add_argument("--all", action="store_true", help="Convert all datasets in base directories") + + args = parser.parse_args() + + if args.dataset: + # Convert single dataset + success = convert_local_dataset(args.dataset, num_workers=args.num_workers) + if success: + print(f"\nšŸŽ‰ Successfully converted: {args.dataset}") + else: + print(f"\nšŸ’„ Failed to convert: {args.dataset}") + sys.exit(1) + + elif args.all: + # Convert all datasets + convert_multiple_datasets( + args.base_dirs, + max_datasets=args.max_datasets, + num_workers=args.num_workers + ) + + else: + parser.print_help() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/convert_libero.py b/examples/convert_libero.py new file mode 100644 index 000000000..7bfc50eae --- /dev/null +++ b/examples/convert_libero.py @@ -0,0 +1,126 @@ +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Root directory of converted data +root_dir = "/raid/jade/libero_converted" + +# No renaming +rename_map = { + +} + +# Hugging Face features metadata (constant across all files) +HF_METADATA = { + 
b"huggingface": b'{"info": {"features": {"observation.images.image": {"_type": "Image"}, "observation.images.image2": {"_type": "Image"}, "state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, "actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, "timestamp": {"dtype": "float32", "_type": "Value"}, "frame_index": {"dtype": "int64", "_type": "Value"}, "episode_index": {"dtype": "int64", "_type": "Value"}, "index": {"dtype": "int64", "_type": "Value"}, "task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata): + try: + table = pq.read_table(parquet_path) + + # Merge metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Apply metadata to table + table = table.replace_schema_metadata(new_meta) + + # Write safely via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + shutil.move(tmp_path, parquet_path) + + print(f"āœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return False + +# Walk through all chunk dirs and patch parquet files +for dirpath, _, filenames in os.walk(root_dir): + for fname in filenames: + if fname.endswith(".parquet"): + fpath = os.path.join(dirpath, fname) + patch_parquet(fpath, HF_METADATA)#!/usr/bin/env python3 + +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Explicit list of files to patch +FILES_TO_PATCH = [ + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", + 
"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", +] + +# Optional renaming map (fill in as needed) +rename_map = { + # "old_column_name": "new_column_name", + "image": "observation.images.image", + "image2": "observation.images.image2", + "actions": "action", +} + +# Hugging Face features metadata (constant across all files) +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + # Load parquet table + table = pq.read_table(parquet_path) + + # If renaming is needed + if rename_map: + schema = table.schema + new_names = [ + rename_map.get(name, name) for name in schema.names + ] + table = table.rename_columns(new_names) + + # Merge schema metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Replace metadata in table + table = table.replace_schema_metadata(new_meta) + + # Write safely via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + + # Replace original file + shutil.move(tmp_path, parquet_path) + + print(f"āœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return 
False + + +if __name__ == "__main__": + for fpath in FILES_TO_PATCH: + if os.path.exists(fpath): + patch_parquet(fpath, HF_METADATA, rename_map) + else: + print(f"āš ļø File not found: {fpath}") diff --git a/examples/evaluate_libero.py b/examples/evaluate_libero.py new file mode 100644 index 000000000..bf99994b6 --- /dev/null +++ b/examples/evaluate_libero.py @@ -0,0 +1,255 @@ +""" +This script demonstrates how to evaluate a pretrained smolVLA policy on the LIBERO benchmark. +""" + +import collections +import dataclasses +import logging +import math +import pathlib + +import cv2 +import draccus +import imageio +import numpy as np +import torch +from libero.libero import benchmark, get_libero_path +from libero.libero.envs import OffScreenRenderEnv +from tqdm import tqdm + +from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy +from lerobot.policies.pi0.modeling_pi0 import PI0Policy + +LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0] +LIBERO_ENV_RESOLUTION = 256 # resolution used to render training data + +@dataclasses.dataclass +class Args: + """ + Evaluation arguments for smolVLA on LIBERO. + """ + + # --- Hugging Face arguments --- + policy_path: str = "lerobot/smolvla_base" + """Path to the pretrained policy on the Hugging Face Hub or local directory.""" + + # --- LIBERO environment-specific parameters --- + task_suite_name: str = "libero_spatial" + """Task suite. 
Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90""" + num_steps_wait: int = 10 + """Number of steps to wait for objects to stabilize in sim.""" + num_trials_per_task: int = 50 + """Number of rollouts per task.""" + + # --- Evaluation arguments --- + video_out_path: str = "data/libero/videos" + """Path to save videos.""" + device: str = "cuda" + """Device to use for evaluation.""" + + seed: int = 7 + """Random Seed (for reproducibility)""" + + +@draccus.wrap() +def eval_libero(args: Args) -> None: + # Set random seed + torch.manual_seed(args.seed) + np.random.seed(args.seed) + + # --- Load Policy --- + policy = SmolVLAPolicy.from_pretrained(args.policy_path) + policy.to(args.device) + policy.eval() + + # --- Initialize LIBERO task suite --- + benchmark_dict = benchmark.get_benchmark_dict() + try: + task_suite = benchmark_dict[args.task_suite_name]() + except KeyError: + raise ValueError( + f"Unknown task suite: {args.task_suite_name}. " + f"Available options are: {list(benchmark_dict.keys())}" + ) + num_tasks_in_suite = task_suite.n_tasks + logging.info(f"Task suite: {args.task_suite_name}") + + pathlib.Path(args.video_out_path).mkdir(parents=True, exist_ok=True) + + if args.task_suite_name == "libero_spatial": + max_steps = 220 # longest training demo has 193 steps + elif args.task_suite_name == "libero_object": + max_steps = 280 # longest training demo has 254 steps + elif args.task_suite_name == "libero_goal": + max_steps = 300 # longest training demo has 270 steps + elif args.task_suite_name == "libero_10": + max_steps = 520 # longest training demo has 505 steps + elif args.task_suite_name == "libero_90": + max_steps = 400 # longest training demo has 373 steps + else: + # Fallback for custom task suites + max_steps = 520 + + # --- Evaluation Loop --- + total_episodes, total_successes = 0, 0 + for task_id in tqdm(range(num_tasks_in_suite), desc="Tasks"): + # Get task + task = task_suite.get_task(task_id) + + # Get default LIBERO initial 
states + initial_states = task_suite.get_task_init_states(task_id) + + # Initialize LIBERO environment and task description + env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args.seed) + + # Start episodes + task_episodes, task_successes = 0, 0 + for episode_idx in tqdm( + range(min(args.num_trials_per_task, len(initial_states))), + desc=f"Task {task_id}: {task.language}", + leave=False, + ): + logging.info(f"\nTask: {task_description}") + + # Reset environment and policy + env.reset() + policy.reset() + + # Set initial states + obs = env.set_init_state(initial_states[episode_idx]) + + # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects + # and we need to wait for them to fall + for _ in range(args.num_steps_wait): + obs, _, _, _ = env.step(LIBERO_DUMMY_ACTION) + + # Setup + t = 0 + frames = [] + done = False + + # Add initial frame + agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) + # frames.append(agentview_image) + # import ipdb; ipdb.set_trace() + logging.info(f"Starting episode {task_episodes+1}...") + while t < max_steps: + try: + # Get preprocessed image + # IMPORTANT: rotate 180 degrees to match train preprocessing + wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1]) + agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) + frames.append(agentview_image) + + # Prepare observations dict + state = np.concatenate( + ( + obs["robot0_eef_pos"], + _quat2axisangle(obs["robot0_eef_quat"]), + obs["robot0_gripper_qpos"], + ) + ) + observation = { + "observation.images.image": torch.from_numpy(agentview_image / 255.0) + .permute(2, 0, 1) + .to(torch.float32) + .to(args.device).unsqueeze(0), + "observation.images.image2": torch.from_numpy(wrist_img / 255.0) + .permute(2, 0, 1) + .to(torch.float32) + .to(args.device).unsqueeze(0), + "observation.state": torch.from_numpy(state).to(torch.float32).to(args.device).unsqueeze(0), + "task": 
task_description, + } + + # Query model to get action + with torch.inference_mode(): + action_tensor = policy.select_action(observation) + action = action_tensor.cpu().numpy()[0] + action[-1] = 1 - action[-1] + + # Execute action in environment + obs, _, done, _ = env.step(action) + if done: + task_successes += 1 + total_successes += 1 + break + t += 1 + + except Exception as e: + logging.error(f"Caught exception: {e}") + break + + task_episodes += 1 + total_episodes += 1 + + # Save a replay video of the episode + suffix = "success" if done else "failure" + task_segment = task_description.replace(" ", "_").replace("/", "_") + video_path = ( + pathlib.Path(args.video_out_path) / f"rollout_task_{task_id}_episode_{episode_idx}_{task_segment}_{suffix}.mp4" + ) + fps = 30 + writer = imageio.get_writer(video_path, fps=fps) + + for image in frames: + writer.append_data(image) + writer.close() + logging.info(f"Saved video to {video_path}") + + # Log current results + logging.info(f"Success: {done}") + if total_episodes > 0: + logging.info(f"# episodes completed so far: {total_episodes}") + logging.info(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)") + + # Log final results for the task + if task_episodes > 0: + logging.info(f"Task {task_id} success rate: {float(task_successes) / float(task_episodes):.2f}") + if total_episodes > 0: + logging.info(f"Cumulative success rate: {float(total_successes) / float(total_episodes):.2f}") + + logging.info("--- Evaluation finished ---") + if total_episodes > 0: + logging.info(f"Total success rate: {float(total_successes) / float(total_episodes):.2f}") + logging.info(f"Total episodes: {total_episodes}") + logging.info(f"Total successes: {total_successes}") + cv2.destroyAllWindows() + + +def _get_libero_env(task, resolution, seed): + """Initializes and returns the LIBERO environment, along with the task description.""" + task_description = task.language + task_bddl_file = 
pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file + env_args = { + "bddl_file_name": str(task_bddl_file), + "camera_heights": resolution, + "camera_widths": resolution, + } + env = OffScreenRenderEnv(**env_args) + env.seed(seed) # IMPORTANT: seed seems to affect object positions even when using fixed initial state + return env, task_description + + +def _quat2axisangle(quat): + """ + Copied from robosuite: + https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 + """ + # clip quaternion + if quat[3] > 1.0: + quat[3] = 1.0 + elif quat[3] < -1.0: + quat[3] = -1.0 + + den = np.sqrt(1.0 - quat[3] * quat[3]) + if math.isclose(den, 0.0): + # This is (close to) a zero degree rotation, immediately return + return np.zeros(3) + + return (quat[:3] * 2.0 * math.acos(quat[3])) / den + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + eval_libero() \ No newline at end of file diff --git a/examples/requirements.in b/examples/requirements.in new file mode 100644 index 000000000..25664608a --- /dev/null +++ b/examples/requirements.in @@ -0,0 +1,8 @@ +imageio[ffmpeg] +numpy==1.22.4 +tqdm +tyro +PyYaml +opencv-python==4.6.0.66 +robosuite==1.4.1 +matplotlib==3.5.3 \ No newline at end of file diff --git a/examples/script2.py b/examples/script2.py new file mode 100644 index 000000000..cbd4da913 --- /dev/null +++ b/examples/script2.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +FILES_TO_PATCH = [ + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", + 
"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", +] + +# Column renaming map +rename_map = { + "wrist_image": "observation.images.image2", + "actions": "action", +} + +# Hugging Face metadata +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + table = pq.read_table(parquet_path) + + # Apply column renames if needed + if rename_map: + schema = table.schema + new_names = [rename_map.get(name, name) for name in schema.names] + table = table.rename_columns(new_names) + + # Merge schema metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Replace metadata + table = table.replace_schema_metadata(new_meta) + + # Write via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + + shutil.move(tmp_path, parquet_path) + print(f"āœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return False + + +if __name__ == "__main__": + for fpath in FILES_TO_PATCH: + if os.path.exists(fpath): + patch_parquet(fpath, HF_METADATA, rename_map) + else: + print(f"āš ļø File not found: {fpath}") 
diff --git a/examples/script3.py b/examples/script3.py new file mode 100644 index 000000000..7b4d7957a --- /dev/null +++ b/examples/script3.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Root directory containing all parquet files +ROOT_DIR = "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data" + +# Column renaming map (normalize schema to what training expects) +rename_map = { + "state": "observation.state", +} + +# Hugging Face metadata (aligned with expected feature names) +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"observation.state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + # Read the parquet table + table = pq.read_table(parquet_path) + + # Apply renames if necessary + if rename_map: + new_names = [rename_map.get(name, name) for name in table.schema.names] + if new_names != table.schema.names: + table = table.rename_columns(new_names) + + # Update metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + table = table.replace_schema_metadata(new_meta) + + # Write to temp file then atomically move back + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + shutil.move(tmp_path, parquet_path) + + # Debug print + print(f"āœ… Patched: {parquet_path}") + 
print(" Columns:", table.schema.names) + return True + except Exception as e: + print(f"āŒ Failed on {parquet_path}: {e}") + return False + +if __name__ == "__main__": + for dirpath, _, filenames in os.walk(ROOT_DIR): + for fname in filenames: + if fname.endswith(".parquet"): + fpath = os.path.join(dirpath, fname) + patch_parquet(fpath, HF_METADATA, rename_map) diff --git a/examples/script4.py b/examples/script4.py new file mode 100644 index 000000000..2eed60a94 --- /dev/null +++ b/examples/script4.py @@ -0,0 +1,3 @@ +from huggingface_hub import HfApi +hub_api = HfApi() +hub_api.create_tag("HuggingFaceVLA/libero", tag="v2.1", repo_type="dataset") diff --git a/log_text.txt b/log_text.txt new file mode 100644 index 000000000..6676df0eb --- /dev/null +++ b/log_text.txt @@ -0,0 +1,1765 @@ + self.vlm_with_expert = SmolVLMWithExpertModel( + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/smolvlm_with_expert.py", line 88, in __init__ + self.processor = AutoProcessor.from_pretrained(model_id) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/models/auto/processing +_auto.py", line 300, in from_pretrained + config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/processing_utils.py", +line 944, in get_processor_dict + resolved_raw_chat_template_file = cached_file( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 32 +1, in cached_file + file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 47 +8, in cached_files + hf_hub_download( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p +y", line 114, in _inner_fn + return 
fn(*args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1010, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1073, in _hf_hub_download_to_cache_dir + (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_err +or( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1546, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p +y", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1463, in get_hf_file_metadata + r = _request_wrapper( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 286, in _request_wrapper + response = _request_wrapper( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 309, in _request_wrapper + response = http_backoff(method=method, url=url, **params, retry_on_exceptions=(), retry_on_status_codes=(429,)) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li +ne 310, in http_backoff + response = session.request(method=method, url=url, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 589, in + request + resp = self.send(prep, **send_kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 703, in + send + r = adapter.send(request, 
**kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li +ne 96, in send + return super().send(request, *args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/adapters.py", line 644, in + send + resp = conn.urlopen( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 78 +7, in urlopen + response = self._make_request( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 53 +4, in _make_request + response = conn.getresponse() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connection.py", line 565, i +n getresponse + httplib_response = super().getresponse() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 1375, in getresponse + response.begin() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 318, in begin + version, status, reason = self._read_status() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 279, in _read_status + line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/socket.py", line 717, in readinto + return self._sock.recv_into(b) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1307, in recv_into + return self.read(nbytes, buffer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1163, in read + return self._sslobj.read(len, buffer) +KeyboardInterrupt +clea +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:50:52 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:50:52 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:50:52 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', 
+ 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 
'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:50:52 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:50:52 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:50:52 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:50:52 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 67057.8 +5it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5343.9 +4it/s] +INFO 2025-09-09 15:50:57 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 47393.2 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3797.4 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 44384.1 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6533.1 +8it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 15:51:30 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 15:51:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 15:51:30 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 15:51:30 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 15:51:30 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 15:51:30 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 15:51:30 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 15:51:30 ts/train.py:187 num_total_params=399268924 (399M) +INFO 2025-09-09 15:51:30 ts/train.py:225 Start offline training on a fixed dataset +> /home/jade_choghari/lerobot/src/lerobot/scripts/train.py(230)train() +-> train_tracker.dataloading_s = time.perf_counter() - start_time +(Pdb) batch.keys() +dict_keys(['image', 'wrist_image', 'state', 'actions', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_i +ndex', 'task']) +(Pdb) policy.config.input_features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256))} +(Pdb) quit() +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train + train_tracker.dataloading_s = time.perf_counter() - start_time + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train + train_tracker.dataloading_s = time.perf_counter() - start_time + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + 
File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise BdbQuit +bdb.BdbQuit +clear +^[[A(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:53:49 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:53:49 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:53:49 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 
360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': 
True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:53:49 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:53:49 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:53:49 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:53:49 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. 
Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 34701.4 +4it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5495.3 +7it/s] +INFO 2025-09-09 15:53:55 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 41943.0 +4it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5500.7 +3it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 2361.6 +6it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5041.2 +3it/s] +Reducing the number of VLM layers to 16 ... 
+> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() +-> assert isinstance(policy, nn.Module) +(Pdb) features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256)), 'actions': PolicyFeature(type=, + shape=(7,))} +(Pdb) ds_meta.features +{'image': {'dtype': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'wrist_image': {'dtyp +e': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'state': {'dtype': 'float32', 'shape' +: (8,), 'names': ['state']}, 'actions': {'dtype': 'float32', 'shape': (7,), 'names': ['actions']}, 'timestamp': {'dt +ype': 'float32', 'shape': (1,), 'names': None}, 'frame_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'ep +isode_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'index': {'dtype': 'int64', 'shape': (1,), 'names': +None}, 'task_index': {'dtype': 'int64', 'shape': (1,), 'names': None}} +(Pdb) quit() + +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train + policy = make_policy( + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + assert isinstance(policy, nn.Module) + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + assert isinstance(policy, nn.Module) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise 
BdbQuit +bdb.BdbQuit +clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:56:35 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:56:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:56:35 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 
'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 
'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:56:35 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:56:35 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:56:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:56:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 78132.9 +5it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 4716.0 +3it/s] +INFO 2025-09-09 15:56:40 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5259.3 +2it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3477.8 +6it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 45343.8 +3it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 5551.6 +9it/s] +Reducing the number of VLM layers to 16 ... 
+> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() +-> assert isinstance(policy, nn.Module) +(Pdb) features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256)), 'state': PolicyFeature(type=, sha +pe=(8,)), 'actions': PolicyFeature(type=, shape=(7,))} +(Pdb) quit() +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train + policy = make_policy( + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + # policy = torch.compile(policy, mode="reduce-overhead") + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + # policy = torch.compile(policy, mode="reduce-overhead") + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise BdbQuit +bdb.BdbQuit +clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:58:35 ils/utils.py:48 Cuda backend detected, using cuda. 
+WARNING 2025-09-09 15:58:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:58:35 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 
'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:58:35 ts/train.py:143 Logs will be saved locally. 
+INFO 2025-09-09 15:58:35 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:58:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:58:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 27666.4 +6it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5305.7 +0it/s] +INFO 2025-09-09 15:58:41 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 44384.1 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3192.0 +1it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 44620.2 +6it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 42799.0 +2it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 15:59:13 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 15:59:13 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 15:59:13 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 15:59:13 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 15:59:13 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 15:59:13 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 15:59:13 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 15:59:13 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 15:59:13 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy + loss, output_dict = policy.forward(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 458, in forward + actions = self.prepare_action(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 580, in prepare_action + actions = pad_vector(batch[ACTION], self.config.max_action_dim) +KeyError: 'action' +Exception in thread Thread-3 (_pin_memory_loop): +Traceback (most recent call last): + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 1016, in _bootstrap_inner + self.run() + File 
"/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 953, in run + self._target(*self._args, **self._kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. +py", line 61, in _pin_memory_loop + do_one_step() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. +py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/multiprocessing/reductions.py +", line 541, in rebuild_storage_fd + fd = df.detach() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in +detach + with _resource_sharer.get_connection(self._id) as conn: + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in +get_connection + c = Client(address, authkey=process.current_process().authkey) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 508, in Clie +nt + answer_challenge(c, authkey) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 752, in answ +er_challenge + message = connection.recv_bytes(256) # reject large message + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 216, in recv +_bytes + buf = self._recv_bytes(maxlength) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 414, in _rec +v_bytes + buf = self._recv(4) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 379, in _rec +v + chunk = read(handle, 
remaining) +ConnectionResetError: [Errno 104] Connection reset by peer +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:59:53 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:59:53 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:59:53 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 
'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 
'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:59:53 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:59:53 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:59:53 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:59:53 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 72147.3 +3it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 5076.7 +1it/s] +INFO 2025-09-09 15:59:58 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6096.3 +7it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 4348.6 +8it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 46091.2 +5it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3225.1 +5it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:00:31 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:00:31 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:00:31 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:00:31 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:00:31 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:00:31 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:00:31 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:00:31 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:00:31 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy + loss, output_dict = policy.forward(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 461, in forward + losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 850, in forward + att_2d_masks = make_att_2d_masks(pad_masks, att_masks) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 226, in make_att_2d_mask +s + att_2d_masks = att_2d_masks & pad_2d_masks +RuntimeError: The size of tensor a (199) must match the size of tensor b (181) at 
non-singleton dimension 2 +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:10:03 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:10:03 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:10:03 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 
'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 
'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:10:03 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:10:03 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:10:03 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:10:03 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 54574.89it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 7567.63it/s] +INFO 2025-09-09 16:10:09 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 40721.40it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 7516.67it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 3158.36it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6775.94it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:10:41 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:10:41 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:10:41 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:10:41 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:10:41 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:10:41 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:10:41 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:10:41 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:10:41 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy + grad_scaler.unscale_(optimizer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 +, in unscale_ + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 +, in _unscale_grads_ + torch._amp_foreach_non_finite_check_and_unscale_( +RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: 
/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:12:28 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:12:28 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:12:28 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, 
+ 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 
'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:12:28 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:12:28 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:12:28 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:12:28 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 87666.13it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 4223.20it/s] +INFO 2025-09-09 16:12:34 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 43690.67it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 4871.43it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6512.89it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 43018.50it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:13:06 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:13:06 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:13:06 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:13:06 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:13:06 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:13:06 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:13:06 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:13:06 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:13:06 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy + grad_scaler.unscale_(optimizer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 +, in unscale_ + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 +, in _unscale_grads_ + torch._amp_foreach_non_finite_check_and_unscale_( +RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: 
/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:13:51 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:13:51 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:13:51 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, 
+ 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': False, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 
'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:13:51 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:13:51 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:13:51 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:13:51 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). 
+ +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 82981.28it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 70/70 [00:00<00:00, 4687.94it/s] +INFO 2025-09-09 16:13:57 ts/train.py:163 Creating policy +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 21345.06it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 4226.00it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 2966.27it/s] +Fetching 2 files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 2/2 [00:00<00:00, 6497.76it/s] +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-09 16:14:30 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:14:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:14:30 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:14:30 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:14:30 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:14:30 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:14:30 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:14:30 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:14:30 ts/train.py:225 Start offline training on a fixed dataset +INFO 2025-09-09 16:16:20 ts/train.py:255 step:200 smpl:6K ep:40 epch:0.02 loss:1.244 grdn:2.492 lr:1.0e-05 updt_s:0. +536 data_s:0.007 +INFO 2025-09-09 16:17:56 ts/train.py:255 step:400 smpl:13K ep:79 epch:0.05 loss:0.685 grdn:4.262 lr:3.0e-05 updt_s:0 +.481 data_s:0.000 +INFO 2025-09-09 16:19:33 ts/train.py:255 step:600 smpl:19K ep:119 epch:0.07 loss:0.364 grdn:4.849 lr:5.0e-05 updt_s: +0.482 data_s:0.000 +INFO 2025-09-09 16:21:10 ts/train.py:255 step:800 smpl:26K ep:158 epch:0.09 loss:0.239 grdn:4.024 lr:7.0e-05 updt_s: +0.481 data_s:0.000 +INFO 2025-09-09 16:22:46 ts/train.py:255 step:1K smpl:32K ep:198 epch:0.12 loss:0.197 grdn:3.267 lr:9.0e-05 updt_s:0 +.478 data_s:0.000 +INFO 2025-09-09 16:24:22 ts/train.py:255 step:1K smpl:38K ep:238 epch:0.14 loss:0.173 grdn:2.319 lr:1.0e-04 updt_s:0 +.481 data_s:0.000 +INFO 2025-09-09 16:25:59 ts/train.py:255 step:1K smpl:45K ep:277 epch:0.16 loss:0.153 grdn:1.741 lr:1.0e-04 updt_s:0 +.483 data_s:0.000 +INFO 2025-09-09 16:27:36 ts/train.py:255 step:2K smpl:51K ep:317 epch:0.19 loss:0.135 grdn:1.354 lr:9.9e-05 updt_s:0 +.483 data_s:0.000 +INFO 2025-09-09 16:29:14 ts/train.py:255 step:2K smpl:58K ep:357 epch:0.21 loss:0.126 grdn:1.177 lr:9.9e-05 updt_s:0 +.484 data_s:0.000 + diff --git 
a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py index f5fa727cf..75863d3fc 100644 --- a/src/lerobot/configs/policies.py +++ b/src/lerobot/configs/policies.py @@ -62,6 +62,7 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP, # automatic gradient scaling is used. use_amp: bool = False + gradient_accumulation_steps: int = 1 push_to_hub: bool = True repo_id: str | None = None diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index a869cb920..6509993bb 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -472,7 +472,6 @@ class LeRobotDataset(torch.utils.data.Dataset): episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes] self.stats = aggregate_stats(episodes_stats) - # Load actual data try: if force_cache_sync: raise FileNotFoundError @@ -598,6 +597,7 @@ class LeRobotDataset(torch.utils.data.Dataset): """hf_dataset contains all the observations, states, actions, rewards, etc.""" if self.episodes is None: path = str(self.root / "data") + # added by jade hf_dataset = load_dataset("parquet", data_dir=path, split="train") else: files = [str(self.root / self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes] diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index 078c5351d..daa1de163 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -455,7 +455,8 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea shape = (shape[2], shape[0], shape[1]) elif key == "observation.environment_state": type = FeatureType.ENV - elif key.startswith("observation"): + # changed by jade + elif key.startswith("observation") or key.startswith("state"): type = FeatureType.STATE elif key.startswith("action"): type = FeatureType.ACTION diff --git 
a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 4b8eeffd1..79461d3a9 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -34,6 +34,7 @@ from lerobot.policies.sac.reward_model.configuration_classifier import RewardCla from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config def get_policy_class(name: str) -> PreTrainedPolicy: @@ -74,6 +75,10 @@ def get_policy_class(name: str) -> PreTrainedPolicy: from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy return SmolVLAPolicy + elif name == "smolpi0": + from lerobot.policies.smolpi0.modeling_smolpi0 import SMOLPI0Policy + + return SMOLPI0Policy else: raise NotImplementedError(f"Policy with name {name} is not implemented.") @@ -97,6 +102,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig: return SmolVLAConfig(**kwargs) elif policy_type == "reward_classifier": return RewardClassifierConfig(**kwargs) + elif policy_type == "smolpi0": + return SMOLPI0Config(**kwargs) else: raise ValueError(f"Policy type '{policy_type}' is not available.") @@ -170,7 +177,6 @@ def make_policy( policy = policy_cls(**kwargs) policy.to(cfg.device) assert isinstance(policy, nn.Module) - # policy = torch.compile(policy, mode="reduce-overhead") return policy diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py index 119055873..043265b1b 100644 --- a/src/lerobot/policies/normalize.py +++ b/src/lerobot/policies/normalize.py @@ -255,6 +255,83 @@ class Unnormalize(nn.Module): return batch +class NormalizePerRobotType(nn.Module): + """Normalizes data (e.g. 
"observation.image") for more stable and faster convergence during training.""" + + def __init__( + self, + features: dict[str, PolicyFeature], + norm_map: dict[str, NormalizationMode], + stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values + are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing + mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape + is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. + modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values + are their normalization modes among: + - "mean_std": subtract the mean and divide by standard deviation. + - "min_max": map to [-1, 1] range. + stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") + and values are dictionaries of statistic types and their values (e.g. + `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for + training the model for the first time, these statistics will overwrite the default buffers. If + not provided, as expected for finetuning or evaluation, the default buffers should to be + overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the + dataset is not needed to get the stats, since they are already in the policy state_dict. + """ + super().__init__() + self.features = features + self.norm_map = norm_map + for robot_type in stats.keys(): + stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) + for key, buffer in stats_buffers.items(): + setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) + + # TODO(rcadene): should we remove torch.no_grad? 
+ @torch.no_grad + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + batch = dict(batch) # shallow copy avoids mutating the input batch + assert "robot_type" in batch, "robot_type is not in the batch" + robot_types = batch["robot_type"] + + for key, ft in self.features.items(): + if key not in batch: + continue + + norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) + if norm_mode is NormalizationMode.IDENTITY: + continue + # FIXME(mshukor): make it more efficient + buffers = [ + getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types + ] + if norm_mode is NormalizationMode.MEAN_STD: + mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))],dim=0) + std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))],dim=0) + if batch[key].ndim == 3: + mean = mean.unsqueeze(1) + std = std.unsqueeze(1) + assert not torch.isinf(mean).any(), _no_stats_error_str("mean") + assert not torch.isinf(std).any(), _no_stats_error_str("std") + batch[key] = (batch[key] - mean) / (std + 1e-8) + elif norm_mode is NormalizationMode.MIN_MAX: + min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) + max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(min).any(), _no_stats_error_str("min") + assert not torch.isinf(max).any(), _no_stats_error_str("max") + if batch[key].ndim == 3: + min = min.unsqueeze(1) + max = max.unsqueeze(1) + # normalize to [0,1] + batch[key] = (batch[key] - min) / (max - min + 1e-8) + # normalize to [-1, 1] + batch[key] = batch[key] * 2 - 1 + else: + raise ValueError(norm_mode) + return batch # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization # and remove the `Normalize` and `Unnormalize` classes. 
def _initialize_stats_buffers( @@ -418,3 +495,87 @@ class UnnormalizeBuffer(nn.Module): raise ValueError(norm_mode) return batch + + +class UnnormalizePerRobotType(nn.Module): + """ + Similar to `Normalize` but unnormalizes output data (e.g. `{"action": torch.randn(b,c)}`) in their + original range used by the environment. + """ + + def __init__( + self, + features: dict[str, PolicyFeature], + norm_map: dict[str, NormalizationMode], + stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values + are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing + mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape + is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. + modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values + are their normalization modes among: + - "mean_std": subtract the mean and divide by standard deviation. + - "min_max": map to [-1, 1] range. + stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") + and values are dictionaries of statistic types and their values (e.g. + `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for + training the model for the first time, these statistics will overwrite the default buffers. If + not provided, as expected for finetuning or evaluation, the default buffers should to be + overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the + dataset is not needed to get the stats, since they are already in the policy state_dict. 
+ """ + super().__init__() + self.features = features + self.norm_map = norm_map + self.stats = stats + # `self.buffer_observation_state["mean"]` contains `torch.tensor(state_dim)` + for robot_type in stats.keys(): + stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) + for key, buffer in stats_buffers.items(): + setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) + + # TODO(rcadene): should we remove torch.no_grad? + @torch.no_grad + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + batch = dict(batch) # shallow copy avoids mutating the input batch + assert "robot_type" in batch, "robot_type is not in the batch" + robot_types = batch["robot_type"] + + for key, ft in self.features.items(): + if key not in batch: + continue + + norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) + if norm_mode is NormalizationMode.IDENTITY: + continue + + # buffer = getattr(self, "buffer_" + key.replace(".", "_")) + buffers = [ + getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types + ] + + if norm_mode is NormalizationMode.MEAN_STD: + mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0) + std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(mean).any(), _no_stats_error_str("mean") + assert not torch.isinf(std).any(), _no_stats_error_str("std") + if batch[key].ndim == 3: + mean = mean.unsqueeze(1) + std = std.unsqueeze(1) + batch[key] = batch[key] * std + mean + elif norm_mode is NormalizationMode.MIN_MAX: + min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) + max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(min).any(), _no_stats_error_str("min") + assert not torch.isinf(max).any(), _no_stats_error_str("max") + if batch[key].ndim == 3: + min = min.unsqueeze(1) + max = max.unsqueeze(1) + batch[key] = (batch[key] 
+ 1) / 2 + batch[key] = batch[key] * (max - min) + min + else: + raise ValueError(norm_mode) + return batch diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py new file mode 100644 index 000000000..c3605cd82 --- /dev/null +++ b/src/lerobot/policies/smolpi0/configuration_smolpi0.py @@ -0,0 +1,210 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from lerobot.optim.optimizers import AdamWConfig +from lerobot.optim.schedulers import ( + CosineDecayWithWarmupSchedulerConfig, +) +from lerobot.configs.policies import PreTrainedConfig +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature + + +@dataclass +class PEFTConfig: + r: int = 4 + lora_alpha: int = 16 + lora_dropout: float = 0.1 + target_modules: str = "q_proj,v_proj" + + +@PreTrainedConfig.register_subclass("smolpi0") +@dataclass +class SMOLPI0Config(PreTrainedConfig): + # Input / output structure. 
+ n_obs_steps: int = 1 + chunk_size: int = 50 + n_action_steps: int = 50 + n_obs_gap: int = 1 + + normalization_mapping: dict[str, NormalizationMode] = field( + default_factory=lambda: { + "VISUAL": NormalizationMode.IDENTITY, + "STATE": NormalizationMode.MEAN_STD, + "ACTION": NormalizationMode.MEAN_STD, + } + ) + + # Shorter state and action vectors will be padded + max_state_dim: int = 32 + max_action_dim: int = 32 + + # Image preprocessing + resize_imgs_with_padding: tuple[int, int] = (512, 512) #(224, 224) + + # Add empty images. Used by pi0_aloha_sim which adds the empty + # left and right wrist cameras in addition to the top camera. + empty_cameras: int = 0 + + # Converts the joint and gripper values from the standard Aloha space to + # the space used by the pi internal runtime which was used to train the base model. + adapt_to_pi_aloha: bool = False + + # Converts joint dimensions to deltas with respect to the current state before passing to the model. + # Gripper dimensions will remain in absolute values. 
+ use_delta_joint_actions_aloha: bool = False + + # Tokenizer + tokenizer_max_length: int = 48 + + # Projector + proj_width: int = 480 + + # Decoding + num_steps: int = 10 + + # Attention utils + use_cache: bool = True + attention_implementation: str = "eager" # or fa2, flex + + # Finetuning settings + freeze_vision_encoder: bool = True + train_expert_only: bool = False + train_state_proj: bool = True + + # Training presets + optimizer_lr: float = 2.5e-5 + optimizer_betas: tuple[float, float] = (0.9, 0.95) + optimizer_eps: float = 1e-8 + optimizer_weight_decay: float = 1e-10 + optimizer_grad_clip_norm: float = 10 + optimizer_lr_vlm: float = 0 + + scheduler_warmup_steps: int = 1_000 + scheduler_decay_steps: int = 30_000 + scheduler_decay_lr: float = 2.5e-6 + + # TODO: Add EMA + vlm_model_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + checkpoint_path: str = None + load_vlm_weights: bool = False + + peft_method: str = "" + peft_config: PEFTConfig = PEFTConfig() + peft_target_model: str = "" + + add_image_special_tokens: bool = False + add_prompt_template: bool = False + prefix_prompt_template: str = f"<|im_start|>User: What action should the robot take to" + suffix_prompt_template: str = f"?\nAssistant:" + + attention_mode: str = "self_attn" + + prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length + + past_obs_keys: str = f"image" + + add_local_special_image_tokens: bool = False + + reverse_images_order: bool = False + + state_to_prefix: bool = False + + pad_language_to: str = "longest" # "max_length" + + num_expert_layers: int = -1 + num_vlm_layers: int = -1 + + causal_action_attention_mask: bool = False + + self_attn_every_n_layers: int = -1 + + expert_width_multiplier: float = 0.5 + + robot_type: str = "" + + self_attn_only_actions: bool = False + + causal_attention_on_history: bool = False + + predict_relative_actions: bool = False + relative_actions_mode: str = "first" + + shuffle_camera_positions: 
bool = False + vlm_img_size: int = -1 + + regression_loss: bool = False + + def __post_init__(self): + super().__post_init__() + if self.vlm_img_size > 0: + self.resize_imgs_with_padding = (self.vlm_img_size, self.vlm_img_size) + """Input validation (not exhaustive).""" + if self.n_action_steps > self.chunk_size: + raise ValueError( + f"The chunk size is the upper bound for the number of action steps per model invocation. Got " + f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`." + ) + # if self.n_obs_steps != 1: + # raise ValueError( + # f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`" + # ) + + if self.use_delta_joint_actions_aloha: + raise NotImplementedError( + "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot." + ) + + def validate_features(self) -> None: + # TODO: implement value error + # if not self.image_features and not self.env_state_feature: + # raise ValueError("You must provide at least one image or the environment state among the inputs.") + + for i in range(self.empty_cameras): + key = f"observation.images.empty_camera_{i}" + empty_camera = PolicyFeature( + type=FeatureType.VISUAL, + shape=(3, 480, 640), + ) + self.input_features[key] = empty_camera + + def get_optimizer_preset(self) -> AdamWConfig: + return AdamWConfig( + lr=self.optimizer_lr, + betas=self.optimizer_betas, + eps=self.optimizer_eps, + weight_decay=self.optimizer_weight_decay, + grad_clip_norm=self.optimizer_grad_clip_norm, + ) + + def get_scheduler_preset(self): + return CosineDecayWithWarmupSchedulerConfig( + peak_lr=self.optimizer_lr, + decay_lr=self.scheduler_decay_lr, + num_warmup_steps=self.scheduler_warmup_steps, + num_decay_steps=self.scheduler_decay_steps, + ) + + @property + def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations + return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, 
self.n_obs_gap)][::-1] + + @property + def action_delta_indices(self) -> list: + return list(range(self.chunk_size)) + + @property + def reward_delta_indices(self) -> None: + return None diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py new file mode 100644 index 000000000..13950f743 --- /dev/null +++ b/src/lerobot/policies/smolpi0/flex_attention.py @@ -0,0 +1,145 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F # noqa: N812 +from packaging.version import Version + +if Version(torch.__version__) > Version("2.5.0"): + # Ffex attention is only available from torch 2.5 onwards + from torch.nn.attention.flex_attention import ( + _mask_mod_signature, + _round_up_to_multiple, + create_block_mask, + create_mask, + flex_attention, + ) + + +@torch.compile(dynamic=False) +def flex_attention_forward( + attention_mask: torch.Tensor, + batch_size: int, + head_dim: int, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + scaling=None, + num_att_heads: int = 8, + num_key_value_heads: int = 1, +): + """ + This is defined out of classes to make compile happy. 
+ """ + + original_dtype = query_states.dtype + num_key_value_groups = num_att_heads // num_key_value_heads + key_states = key_states[:, :, :, None, :] + key_states = key_states.expand( + batch_size, key_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim + ) + key_states = key_states.reshape( + batch_size, key_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim + ) + + value_states = value_states[:, :, :, None, :] + value_states = value_states.expand( + batch_size, value_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim + ) + value_states = value_states.reshape( + batch_size, value_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # query_states = query_states.to(torch.float32) + # key_states = key_states.to(torch.float32) + # value_states = value_states.to(torch.float32) + + causal_mask = attention_mask + if causal_mask is not None: + causal_mask = causal_mask[:, None, :, : key_states.shape[2]] + + if causal_mask.shape[1] == 1 and query_states.shape[1] > 1: + causal_mask = causal_mask.expand(-1, query_states.shape[1], -1, -1) + + def precomputed_mask_factory(precomputed_mask: torch.Tensor) -> _mask_mod_signature: + def mask_mod(b, h, q_idx, kv_idx): + # Danger zone: if b,h,q_idx,kv_idx exceed the shape, device-side assert occurs. 
+ return precomputed_mask[b][h][q_idx][kv_idx] + + return mask_mod + + b_mask, h_mask, q_len, kv_len = causal_mask.shape # The shape of your mask + + block_size = 128 # limitation of flex attention + q_len_rounded = _round_up_to_multiple(q_len, block_size) + kv_len_rounded = _round_up_to_multiple(kv_len, block_size) + + # *CRITICAL* we do need to expand here, else we get a CUDA index error + + pad_q = q_len_rounded - q_len + pad_k = kv_len_rounded - kv_len + if pad_q > 0 or pad_k > 0: + padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0) + else: + padded_causal_mask = causal_mask + mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask) + + mask_4d = create_mask( + mod_fn=mask_mod_fn_orig, + B=b_mask, + H=h_mask, + Q_LEN=q_len_rounded, + KV_LEN=kv_len_rounded, + device=causal_mask.device, + ) + + mask_mod_fn_padded = precomputed_mask_factory(mask_4d) + # FIXME(mshukor): compile mask torch.compile(create_block_mask) + create_block_mask_compiled = torch.compile(create_block_mask) + block_mask = create_block_mask_compiled( + mask_mod=mask_mod_fn_padded, + B=b_mask, + H=None, # + Q_LEN=q_len_rounded, + KV_LEN=kv_len_rounded, + BLOCK_SIZE=block_size, + device=causal_mask.device, + _compile=False, + ) + padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states + padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states + padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states + # mask is applied inside the kernel, ideally more efficiently than score_mod. 
+ attn_output, attention_weights = flex_attention( + padded_query_states, + padded_key_states, + padded_value_states, + block_mask=block_mask, + enable_gqa=True, # because we shaped query/key states for GQA + scale=head_dim**-0.5 if scaling is None else scaling, + return_lse=True, + ) + + attn_output = attn_output.to(dtype=original_dtype) + attn_output = attn_output.transpose(1, 2).contiguous() # [B, Q_LEN, H, head_dim] + attn_output = attn_output.reshape( + batch_size, + -1, + attn_output.shape[2] * attn_output.shape[3], # merges [H, head_dim] + ) + return attn_output[:, :-pad_k, :] if pad_k > 0 else attn_output diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py new file mode 100644 index 000000000..fa2d3d5a7 --- /dev/null +++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py @@ -0,0 +1,1021 @@ +#!/usr/bin/env python + +# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Ļ€0: A Vision-Language-Action Flow Model for General Robot Control + +[Paper](https://www.physicalintelligence.company/download/pi0.pdf) +[Jax code](https://github.com/Physical-Intelligence/openpi) + +Designed by Physical Intelligence. Ported from Jax by Hugging Face. 
+ +Install pi0 extra dependencies: +```bash +pip install -e ".[pi0]" +``` + +Example of finetuning the pi0 pretrained model (`pi0_base` in `openpi`): +```bash +python lerobot/scripts/train.py \ +--policy.path=lerobot/pi0 \ +--dataset.repo_id=danaaubakirova/koch_test +``` + +Example of finetuning the pi0 neural network with PaliGemma and expert Gemma +pretrained with VLM default parameters before pi0 finetuning: +```bash +python lerobot/scripts/train.py \ +--policy.type=pi0 \ +--dataset.repo_id=danaaubakirova/koch_test +``` + +Example of using the pi0 pretrained model outside LeRobot training framework: +```python +policy = Pi0Policy.from_pretrained("lerobot/pi0") +``` + +""" + +import math +from collections import deque + +import torch +import torch.nn.functional as F # noqa: N812 +from torch import Tensor, nn +from transformers import AutoProcessor + +from lerobot.constants import ACTION, OBS_STATE +from lerobot.policies.normalize import ( + Normalize, + NormalizePerRobotType, + Unnormalize, + UnnormalizePerRobotType, +) +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config +from lerobot.policies.smolpi0.smolvlm_with_expert import ( + SmolVLMWithExpertModel +) +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.utils.utils import get_safe_dtype +OBS_IMAGE = "observation.image" +OBS_IMAGES = "observation.images" +ACTION = "action" +OBS_IMAGE_2 = "observation.image2" +OBS_IMAGE_3 = "observation.image3" +OBS_IMAGE_4 = "observation.image4" +TASK = "task" +ROBOT = "robot_type" +IMAGES_ORDER = { + OBS_IMAGE: 0, + OBS_IMAGE_2: 1, + OBS_IMAGE_3: 2, + OBS_IMAGE_4: 3, +} +from lerobot.policies.utils import ( + populate_queues, +) +import random +def create_sinusoidal_pos_embedding( + time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" +) -> Tensor: + """Computes sine-cosine positional embedding vectors for scalar positions.""" + if dimension % 2 != 0: + raise ValueError(f"dimension ({dimension}) must 
be divisible by 2") + + if time.ndim != 1: + raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + + dtype = get_safe_dtype(torch.float64, device.type) + fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) + period = min_period * (max_period / min_period) ** fraction + + # Compute the outer product + scaling_factor = 1.0 / period * 2 * math.pi + sin_input = scaling_factor[None, :] * time[:, None] + pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + return pos_emb + + +def sample_beta(alpha, beta, bsize, device): + gamma1 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / alpha) + gamma2 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / beta) + return gamma1 / (gamma1 + gamma2) + + +def make_att_2d_masks(pad_masks, att_masks): + """Copied from big_vision. + + Tokens can attend to valid inputs tokens which have a cumulative mask_ar + smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to + setup several types of attention, for example: + + [[1 1 1 1 1 1]]: pure causal attention. + + [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between + themselves and the last 3 tokens have a causal attention. The first + entry could also be a 1 without changing behaviour. + + [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a + block can attend all previous blocks and all tokens on the same block. + + Args: + input_mask: bool[B, N] true if its part of the input, false if padding. + mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on + it and 0 where it shares the same attention mask as the previous token. 
+ """ + if att_masks.ndim != 2: + raise ValueError(att_masks.ndim) + if pad_masks.ndim != 2: + raise ValueError(pad_masks.ndim) + + cumsum = torch.cumsum(att_masks, dim=1) + att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] + pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] + att_2d_masks = att_2d_masks & pad_2d_masks + return att_2d_masks + + +def resize_with_pad(img, width, height, pad_value=-1): + # assume no-op when width height fits already + if img.ndim != 4: + raise ValueError(f"(b,c,h,w) expected, but {img.shape}") + + cur_height, cur_width = img.shape[2:] + + ratio = max(cur_width / width, cur_height / height) + resized_height = int(cur_height / ratio) + resized_width = int(cur_width / ratio) + resized_img = F.interpolate( + img, size=(resized_height, resized_width), mode="bilinear", align_corners=False + ) + + pad_height = max(0, int(height - resized_height)) + pad_width = max(0, int(width - resized_width)) + + # pad on left and top of image + padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) + return padded_img + + +def pad_vector(vector, new_dim): + """Can be (batch_size x sequence_length x features_dimension) + or (batch_size x features_dimension) + """ + if vector.shape[-1] == new_dim: + return vector + shape = list(vector.shape) + current_dim = shape[-1] + shape[-1] = new_dim + new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) + new_vector[..., :current_dim] = vector + return new_vector + + +def normalize(x, min_val, max_val): + return (x - min_val) / (max_val - min_val) + + +def unnormalize(x, min_val, max_val): + return x * (max_val - min_val) + min_val + + +def safe_arcsin(value): + # This ensures that the input stays within + # [āˆ’1,1] to avoid invalid values for arcsin + return torch.arcsin(torch.clamp(value, -1.0, 1.0)) + + +def aloha_gripper_to_angular(value): + # Aloha transforms the gripper positions into a linear space. 
The following code + # reverses this transformation to be consistent with pi0 which is pretrained in + # angular space. + # + # These values are coming from the Aloha code: + # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED + value = unnormalize(value, min_val=0.01844, max_val=0.05800) + + # This is the inverse of the angular to linear transformation inside the Interbotix code. + def linear_to_radian(linear_position, arm_length, horn_radius): + value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) + return safe_arcsin(value) + + # The constants are taken from the Interbotix code. + value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) + + # Normalize to [0, 1]. + # The values 0.4 and 1.5 were measured on an actual Trossen robot. + return normalize(value, min_val=0.4, max_val=1.5) + + +def aloha_gripper_from_angular(value): + # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha. + # Note that the units are still angular but the range is different. + + # The values 0.4 and 1.5 were measured on an actual Trossen robot. + value = unnormalize(value, min_val=0.4, max_val=1.5) + + # These values are coming from the Aloha code: + # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE + return normalize(value, min_val=-0.6213, max_val=1.4910) + + +def aloha_gripper_from_angular_inv(value): + # Directly inverts the gripper_from_angular function. 
+ value = unnormalize(value, min_val=-0.6213, max_val=1.4910) + return normalize(value, min_val=0.4, max_val=1.5) + +class SMOLPI0Policy(PreTrainedPolicy): + """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" + + config_class = SMOLPI0Config + name = "smolpi0" + + def __init__( + self, + config: SMOLPI0Config, + dataset_stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + config: Policy configuration class instance or None, in which case the default instantiation of + the configuration class is used. + dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected + that they will be passed with a call to `load_state_dict` before the policy is used. + """ + + super().__init__(config) + config.validate_features() + self.config = config + self.normalize_per_robot_type = getattr( + config, "normalize_per_robot_type", False + ) # FIXME(mshukor): assert in case of single dataset + if self.normalize_per_robot_type: + if not dataset_stats: + dataset_stats[config.robot_type] = {} + self.normalize_inputs = NormalizePerRobotType( + config.input_features, config.normalization_mapping, dataset_stats + ) + self.normalize_targets = NormalizePerRobotType( + config.output_features, config.normalization_mapping, dataset_stats + ) + self.unnormalize_outputs = UnnormalizePerRobotType( + config.output_features, config.normalization_mapping, dataset_stats + ) + else: + self.normalize_inputs = Normalize( + config.input_features, config.normalization_mapping, dataset_stats + ) + self.normalize_targets = Normalize( + config.output_features, config.normalization_mapping, dataset_stats + ) + self.unnormalize_outputs = Unnormalize( + config.output_features, config.normalization_mapping, dataset_stats + ) + + self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer + self.model = VLAFlowMatching(config) + self.include_past_states = config.n_obs_steps > 1 and 
OBS_STATE in self.config.past_obs_keys.split(",") + self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") + self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 + self.reset() + + def reset(self): + """This should be called whenever the environment is reset.""" + # self._action_queue = deque([], maxlen=self.config.n_action_steps) + self._queues = { + ACTION: deque(maxlen=self.config.n_action_steps), + } + if self.config.n_obs_steps > 1: + for k in self.config.input_features: + if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]): + self._queues[k] = deque(maxlen=self.config.n_obs_steps) + + def get_optim_params(self) -> dict: + if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr: + params = [ + { + "params": [ + p + for n, p in self.named_parameters() + if not ".vlm." in n and p.requires_grad + ] + }, + { + "params": [ + p + for n, p in self.named_parameters() + if ".vlm." in n and p.requires_grad + ], + "lr": self.config.optimizer_lr_vlm, + }, + ] + return params + + else: + return self.parameters() + + + def merge_peft_model_weights(self) -> None: + if "lora" in self.config.peft_method: + self.model.vlm_with_expert.merge_lora_weights() + + @torch.no_grad + def select_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + """Select a single action given environment observations. + + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. 
+ """ + self.eval() + + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + + batch = self.normalize_inputs(batch) + + images, img_masks = self.prepare_images(batch) + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + + actions = self.model.sample_actions( + images, img_masks, lang_tokens, lang_masks, state, noise=noise + ) + # Unpad actions + original_action_dim = self.config.action_feature.shape[0] + actions = actions[:, :, :original_action_dim] + + actions = self.unnormalize_outputs({"action": actions, "robot_type": batch["robot_type"]})["action"] + + if self.config.adapt_to_pi_aloha: + actions = self._pi_aloha_encode_actions(actions) + + return actions + + @torch.no_grad + def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + """Select a single action given environment observations. + + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. + """ + self.eval() + + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + + batch = self.normalize_inputs(batch) + + self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by + # querying the policy. 
+ if len(self._queues[ACTION]) == 0: + for k in batch: + if k in self._queues: + batch[k] = torch.stack(list(self._queues[k]), dim=1) + images, img_masks = self.prepare_images(batch) + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + actions = self.model.sample_actions( + images, img_masks, lang_tokens, lang_masks, state, noise=noise + ) + if self.config.predict_relative_actions and actions.ndim == 3: + # If the model predicts relative actions, we need to unpad the actions + # and then convert them to absolute actions. + if self.config.relative_actions_mode == "first": + actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :1]), dim=1) + elif self.config.relative_actions_mode == "state": + actions = actions + state.unsqueeze(1) + else: + actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :-1]), dim=1) + # Unpad actions + original_action_dim = self.config.action_feature.shape[0] + actions = actions[:, :, :original_action_dim] + + actions = self.unnormalize_outputs({"action": actions})["action"] + + if self.config.adapt_to_pi_aloha: + actions = self._pi_aloha_encode_actions(actions) + + # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue + # effectively has shape (n_action_steps, batch_size, *), hence the transpose. 
+ self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) + return self._queues[ACTION].popleft() + + def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: + """Do a full training forward pass to compute the loss""" + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) + batch = self.normalize_inputs(batch) + batch = self.normalize_targets(batch) + images, img_masks = self.prepare_images( + batch + ) # FIXME(mshukor): adapte it to take into account already padded images in the batch + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + actions = self.prepare_action(batch, state=state) + actions_is_pad = batch.get("actions_id_pad") + loss_dict = {} + losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) + loss_dict["losses_after_forward"] = losses.mean().clone() + + if actions_is_pad is not None: + in_episode_bound = ~actions_is_pad + losses = losses * in_episode_bound.unsqueeze(-1) + loss_dict["losses_after_in_ep_bound"] = losses.mean().clone() + + # Remove padding + losses = losses[:, :, : self.config.max_action_dim] + loss_dict["losses_after_rm_padding"] = losses.mean().clone() + + # For backward pass + loss = losses.mean() + # For backward pass + loss_dict["loss"] = loss + # # For logging + # loss_dict["l2_loss"] = loss.item() # remove for torch compile + return loss_dict + + def prepare_images(self, batch): + """Apply Pi0 preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and + convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. 
+ """ + images = [] + img_masks = [] + present_img_keys = [key for key in self.config.image_features if key in batch] + missing_img_keys = [key for key in self.config.image_features if key not in batch] + + present_img_keys = sorted(present_img_keys, key=lambda k: IMAGES_ORDER.get(k, float("inf")), reverse=self.config.reverse_images_order) + if self.config.shuffle_camera_positions and ACTION in batch: # only during training + present_img_keys = random.sample(present_img_keys, len(present_img_keys)) + if len(present_img_keys) == 0: + raise ValueError( + f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" + ) + for i in range(self.num_past_images): + # Preprocess image features present in the batch + for key in present_img_keys: + img = batch[key][:, i, :, :, :] if batch[key].ndim == 5 else batch[key] + if self.config.resize_imgs_with_padding is not None: + img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) + + # Normalize from range [0,1] to [-1,1] as expacted by siglip + img = img * 2.0 - 1.0 + + bsize = img.shape[0] + device = img.device + if f"{key}_padding_mask" in batch: + mask = batch[f"{key}_padding_mask"].bool() + else: + mask = torch.ones(bsize, dtype=torch.bool, device=device) + images.append(img) + img_masks.append(mask) + + # Create image features not present in the batch + # as fully 0 padded images. 
+ for num_empty_cameras in range(len(missing_img_keys)): + if num_empty_cameras >= self.config.empty_cameras: + break + img = torch.ones_like(img) * -1 + mask = torch.zeros_like(mask) + images.append(img) + img_masks.append(mask) + return images, img_masks + + def prepare_language(self, batch) -> tuple[Tensor, Tensor]: + """Tokenize the text input""" + device = batch[OBS_STATE].device + tasks = batch["task"] + if len(tasks) == 1: + tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] + + if self.config.add_prompt_template: + tasks = [f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" for task in tasks] + else: + tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] + tokenized_prompt = self.language_tokenizer.__call__( + tasks, + padding=self.config.pad_language_to, + padding_side="right", + max_length=self.config.tokenizer_max_length, + return_tensors="pt", + truncation=True, # FIXME(mshukor) + ) + + lang_tokens = tokenized_prompt["input_ids"].to(device=device) + lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) + + return lang_tokens, lang_masks + + def _pi_aloha_decode_state(self, state): + # Flip the joints. + for motor_idx in [1, 2, 8, 9]: + state[:, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) + return state + + def _pi_aloha_encode_actions(self, actions): + # Flip the joints. + for motor_idx in [1, 2, 8, 9]: + actions[:, :, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) + return actions + + def _pi_aloha_encode_actions_inv(self, actions): + # Flip the joints again. 
+ for motor_idx in [1, 2, 8, 9]: + actions[:, :, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) + return actions + + def prepare_state(self, batch): + """Pad state""" + state = batch[OBS_STATE][:, -1, :] if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) else batch[OBS_STATE] # FIXME(mshukor): no state history for now + state = pad_vector(state, self.config.max_state_dim) + return state + + def prepare_action(self, batch, state=None): + """Pad action""" + actions = pad_vector(batch[ACTION], self.config.max_action_dim) + if self.config.predict_relative_actions and actions.ndim == 3: + if self.config.relative_actions_mode == "first": + actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1) + elif self.config.relative_actions_mode == "state": + assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], "Relative action mode 'state' requires the action and state to have the same dimension." + if state.ndim == 2: + state = state.unsqueeze(1) + actions = actions - state + else: + actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1) + return actions + +def pad_tensor(tensor, max_len, pad_value=0): + """ + Efficiently pads a tensor along sequence dimension to match max_len. + + Args: + tensor (torch.Tensor): Shape (B, L, ...) or (B, L). + max_len (int): Fixed sequence length. + pad_value (int/float): Value for padding. + + Returns: + torch.Tensor: Shape (B, max_len, ...) or (B, max_len). 
+ """ + B, L = tensor.shape[:2] + + # Create a padded tensor of max_len and copy the existing values + padded_tensor = torch.full((B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device) + padded_tensor[:, :L] = tensor # Efficient in-place copy + + return padded_tensor + +class VLAFlowMatching(nn.Module): + """ + Ļ€0: A Vision-Language-Action Flow Model for General Robot Control + + [Paper](https://www.physicalintelligence.company/download/pi0.pdf) + [Jax code](https://github.com/Physical-Intelligence/openpi) + + Designed by Physical Intelligence. Ported from Jax by Hugging Face. + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + │ actions │ + │ ā–² │ + │ ā”Œā”“ā”€ā”€ā”€ā”€ā”€ā” │ + │ kv cache │Gemma │ │ + │ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā–ŗā”‚Expert│ │ + │ │ │ │ │ + │ ā”Œā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │x 10 │ │ + │ │ │ ā””ā–²ā”€ā”€ā–²ā”€ā”€ā”˜ │ + │ │ VLM │ │ │ │ + │ │ │ │ robot state │ + │ │ │ noise │ + │ ā””ā–²ā”€ā”€ā–²ā”€ā”€ā”€ā”€ā”€ā”˜ │ + │ │ │ │ + │ │ image(s) │ + │ language tokens │ + ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + """ + + def __init__(self, config): + super().__init__() + self.config = config + + self.vlm_with_expert = SmolVLMWithExpertModel(model_id=self.config.vlm_model_name, + freeze_vision_encoder=self.config.freeze_vision_encoder, + train_expert_only=self.config.train_expert_only, + attention_implementation=self.config.attention_implementation, + load_vlm_weights=self.config.load_vlm_weights, + attention_mode=self.config.attention_mode, + num_expert_layers=self.config.num_expert_layers, + num_vlm_layers=self.config.num_vlm_layers, + self_attn_every_n_layers=self.config.self_attn_every_n_layers, + expert_width_multiplier=self.config.expert_width_multiplier, + self_attn_only_actions=self.config.self_attn_only_actions, + ) + # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert) + 
self.vlm_with_expert.configure_peft(config=self.config) + # Projections are float32 + self.state_to_prefix = self.config.state_to_prefix + if self.state_to_prefix: + self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size) + else: + self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size) + self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) + self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) + + self.action_time_mlp_in = nn.Linear(self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size) + self.action_time_mlp_out = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size) + + self.set_requires_grad() + # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ... 
+ if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]): + if "SmolVLM-Instruct" in self.config.vlm_model_name: + self.fake_image_token = 49152 + self.global_image_token = [44, 13906, 29, 6266, 46] + self.global_image_start_token = torch.tensor([self.fake_image_token] + self.global_image_token, dtype=torch.long) + else: + self.fake_image_token = 49189 + self.global_image_token = 49152 + self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + else: + self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id + self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id + self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + + self.add_image_special_tokens = self.config.add_image_special_tokens + self.add_local_special_image_tokens = self.config.add_local_special_image_tokens + self.local_image_tokens = [torch.tensor([self.fake_image_token, tok], dtype=torch.long) for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]] # assume 3 x 3 grid + + self.local_image_start_token = self.global_image_start_token + self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) + self.prefix_length = self.config.prefix_length + self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") + self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 + self.causal_attention_on_history = self.config.causal_attention_on_history + + + + + # def configure_peft(self, model): + # # return model + # self.peft_method = self.config.peft_method + # if "lora" in self.peft_method: + # peft_config = self.config.peft_config + # target_modules = peft_config.target_modules + # if not isinstance(target_modules, list): + # target_modules = target_modules.split(",") + # lora_config = LoraConfig( + # 
task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) + # r=peft_config.r, # The rank of the low-rank adaptation + # lora_alpha=peft_config.lora_alpha, # Scaling factor + # lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers + # target_modules=target_modules, # The components where LoRA is applied + # exclude_modules=["gemma_expert", "model.gemma_expert.model.layers"], # FIXME(mshukor): this does not work for now + # ) + # # LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=1, lora_dropout=0, target_modules=["q_proj"], exclude_modules=["gemma_expert"]) + # self.lora_config = lora_config + # # Apply LoRA and ensure only LoRA parameters are trainable + + # model = get_peft_model(model, lora_config) + # assert self.config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? + # for name, param in model.named_parameters(): + # if ( + # "lora" in name + # ): # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer + # param.requires_grad = True + # return model + + def set_requires_grad(self): + for params in self.state_proj.parameters(): + params.requires_grad = self.config.train_state_proj + + def sample_noise(self, shape, device): + noise = torch.normal( + mean=0.0, + std=1.0, + size=shape, + dtype=torch.float32, + device=device, + ) + return noise + + def sample_time(self, bsize, device): + time_beta = sample_beta(1.5, 1.0, bsize, device) + time = time_beta * 0.999 + 0.001 + return time.to(dtype=torch.float32, device=device) + + def embed_prefix( + self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Embed images with SigLIP and language tokens with embedding layer to prepare + for SmolVLM transformer processing. 
+ """ + # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty + embs = [] + pad_masks = [] + att_masks = [] + num_images = len(images) // self.num_past_images + # TODO: remove for loop + for img_idx, ( + img, + img_mask, + ) in enumerate(zip(images, img_masks, strict=False)): + # FIXME(mshukor): add special tokens for the history each history_steps or not + if self.add_image_special_tokens: + if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1: + local_token_idx = img_idx % num_images + image_start_token = self.vlm_with_expert.embed_language_tokens(self.local_image_tokens[local_token_idx].to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + else: + image_start_token = self.vlm_with_expert.embed_language_tokens(self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + image_start_mask = torch.ones_like(image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device) + if self.causal_attention_on_history and img_idx % num_images == 0: + att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1) + else: + att_masks += [0] * (image_start_mask.shape[-1]) + embs.append(image_start_token) + pad_masks.append(image_start_mask) + + img_emb = self.vlm_with_expert.embed_image(img) + img_emb = img_emb #.to(dtype=self.vlm_with_expert.type) + + # Normalize image embeddings + img_emb_dim = img_emb.shape[-1] + img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) + + bsize, num_img_embs = img_emb.shape[:2] + img_mask = img_mask[:, None].expand(bsize, num_img_embs) + + # FIXME(mshukor): add special image tokens. Assume no tiling fake global images fake + # template <|im_start|>User: What actions? image tokens \nAssistant: or processor.apply_chat_template? 
+ # processor.fake_image_token + # processor.global_image_token + + embs.append(img_emb) + pad_masks.append(img_mask) + + att_masks += [0] * (num_img_embs) + if self.add_image_special_tokens: + if not self.add_local_special_image_tokens or (self.add_local_special_image_tokens and img_idx % num_images == num_images - 1): + image_end_token = self.vlm_with_expert.embed_language_tokens(self.image_end_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + image_end_mask = torch.ones_like(image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device) + embs.append(image_end_token) + pad_masks.append(image_end_mask) + att_masks += [0] * (image_end_mask.shape[1]) + lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) + # Normalize language embeddings + lang_emb_dim = lang_emb.shape[-1] + lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm? + + embs.append(lang_emb) + pad_masks.append(lang_masks) + + # full attention between image and language inputs + num_lang_embs = lang_emb.shape[1] + att_masks += [0] * num_lang_embs + + if state is not None and self.state_to_prefix: + state_emb = self.state_proj(state) + state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + embs.append(state_emb) + bsize = state_emb.shape[0] + dtype = state_emb.dtype + device = state_emb.device + + states_seq_len = state_emb.shape[1] + state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) + pad_masks.append(state_mask) + + # Set attention masks so that image and language inputs do not attend to state or actions + # att_masks += [1] + [0]*(states_seq_len - 1) + att_masks += [1]*(states_seq_len) + embs = torch.cat(embs, dim=1) + pad_masks = torch.cat(pad_masks, dim=1) + att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) + att_masks = att_masks[None, :] + + seq_len = pad_masks.shape[1] + if 
seq_len < self.prefix_length: + embs = pad_tensor(embs, self.prefix_length, pad_value=0) + pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) + att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) + + att_masks = att_masks.expand(bsize, -1) + + return embs, pad_masks, att_masks + + def embed_suffix(self, state, noisy_actions, timestep): + """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" + embs = [] + pad_masks = [] + att_masks = [] + + # Embed state + if not self.state_to_prefix: + state_emb = self.state_proj(state) + state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + embs.append(state_emb) + bsize = state_emb.shape[0] + dtype = state_emb.dtype + device = state_emb.device + + states_seq_len = state_emb.shape[1] + state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) + pad_masks.append(state_mask) + + # Set attention masks so that image and language inputs do not attend to state or actions + att_masks += [1] + [0]*(states_seq_len - 1) + + + # Fuse timestep + action information using an MLP + action_emb = self.action_in_proj(noisy_actions) + device = action_emb.device + bsize = action_emb.shape[0] + dtype = action_emb.dtype + # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] + time_emb = create_sinusoidal_pos_embedding( + timestep, self.vlm_with_expert.expert_hidden_size, min_period=4e-3, max_period=4.0, device=device + ) + time_emb = time_emb.type(dtype=dtype) + + time_emb = time_emb[:, None, :].expand_as(action_emb) + action_time_emb = torch.cat([action_emb, time_emb], dim=2) + + action_time_emb = self.action_time_mlp_in(action_time_emb) + action_time_emb = F.silu(action_time_emb) # swish == silu + action_time_emb = self.action_time_mlp_out(action_time_emb) + + # Add to input tokens + embs.append(action_time_emb) + + bsize, action_time_dim = action_time_emb.shape[:2] + 
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) + pad_masks.append(action_time_mask) + + # Set attention masks so that image, language and state inputs do not attend to action tokens + if self.config.causal_action_attention_mask: + att_masks += [1] * self.config.chunk_size + else: + att_masks += [1] + ([0] * (self.config.chunk_size - 1)) + embs = torch.cat(embs, dim=1) + pad_masks = torch.cat(pad_masks, dim=1) + att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) + att_masks = att_masks[None, :].expand(bsize, len(att_masks)) + return embs, pad_masks, att_masks + + def forward( + self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None + ) -> Tensor: + """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" + if noise is None: + noise = self.sample_noise(actions.shape, actions.device) + + if time is None: + time = self.sample_time(actions.shape[0], actions.device) + + time_expanded = time[:, None, None] + if self.config.regression_loss: + # Hack to compare regression to flow matching + time = torch.zeros_like(time, dtype=time.dtype, device=time.device) + x_t = torch.zeros_like(actions, dtype=actions.dtype, device=actions.device) + u_t = actions + else: + x_t = time_expanded * noise + (1 - time_expanded) * actions + u_t = noise - actions + prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( + images, img_masks, lang_tokens, lang_masks, state=state + ) + suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time) + + pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) + att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) + + att_2d_masks = make_att_2d_masks(pad_masks, att_masks) + position_ids = torch.cumsum(pad_masks, dim=1) - 1 + (_, suffix_out), _ = self.vlm_with_expert.forward( + attention_mask=att_2d_masks, + position_ids=position_ids, + past_key_values=None, + 
inputs_embeds=[prefix_embs, suffix_embs], + use_cache=False, + fill_kv_cache=False, + ) + suffix_out = suffix_out[:, -self.config.chunk_size :] + # Original openpi code, upcast attention output + suffix_out = suffix_out.to(dtype=torch.float32) + v_t = self.action_out_proj(suffix_out) + if self.config.regression_loss: + losses = F.l1_loss(u_t, v_t, reduction="none") + else: + losses = F.mse_loss(u_t, v_t, reduction="none") + return losses + + def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: + """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" + bsize = state.shape[0] + device = state.device + + if noise is None: + actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) + noise = self.sample_noise(actions_shape, device) + + prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( + images, img_masks, lang_tokens, lang_masks, state=state + ) + prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) + prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 + # Compute image and language key value cache + _, past_key_values = self.vlm_with_expert.forward( + attention_mask=prefix_att_2d_masks, + position_ids=prefix_position_ids, + past_key_values=None, + inputs_embeds=[prefix_embs, None], + use_cache=self.config.use_cache, + fill_kv_cache=True, + ) + if self.config.regression_loss: + x_t = torch.zeros_like(noise, dtype=torch.float32, device=device) + expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device) + x_t = self.denoise_step( + state, + prefix_pad_masks, + past_key_values, + x_t, + expanded_time, + ) + else: + dt = -1.0 / self.config.num_steps + dt = torch.tensor(dt, dtype=torch.float32, device=device) + + x_t = noise + time = torch.tensor(1.0, dtype=torch.float32, device=device) + while time >= -dt / 2: + expanded_time = time.expand(bsize) + v_t = self.denoise_step( + state, + prefix_pad_masks, + 
past_key_values, + x_t, + expanded_time, + ) + + # Euler step + x_t += dt * v_t + time += dt + return x_t + + def denoise_step( + self, + state, + prefix_pad_masks, + past_key_values, + x_t, + timestep, + ): + """Apply one denoising step of the noise `x_t` at a given timestep.""" + suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, timestep) + + suffix_len = suffix_pad_masks.shape[1] + batch_size = prefix_pad_masks.shape[0] + prefix_len = prefix_pad_masks.shape[1] + prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) + + suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) + + full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) + prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] + position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 + + outputs_embeds, _ = self.vlm_with_expert.forward( + attention_mask=full_att_2d_masks, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=[None, suffix_embs], + use_cache=self.config.use_cache, + fill_kv_cache=False, + ) + suffix_out = outputs_embeds[1] + suffix_out = suffix_out[:, -self.config.chunk_size :] + suffix_out = suffix_out.to(dtype=torch.float32) + v_t = self.action_out_proj(suffix_out) + return v_t diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py new file mode 100644 index 000000000..b910679f1 --- /dev/null +++ b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py @@ -0,0 +1,824 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union +from functools import partial +import copy + +import torch +import torch.version +import torch.nn.functional as F # noqa: N812 +from peft import LoraConfig, TaskType, get_peft_model +from pytest import Cache +from torch import nn +from transformers import ( + AutoConfig, + GemmaForCausalLM, + AutoModelForImageTextToText, + AutoProcessor, + PretrainedConfig, + PreTrainedModel, + SmolVLMForConditionalGeneration, + AutoModel, + AutoModelForVision2Seq, +) +from transformers.models.auto import CONFIG_MAPPING +from transformers import SmolVLMModel, SmolVLMConfig +from lerobot.policies.smolpi0.flex_attention import flex_attention_forward + +def _round_up_to_multiple(x, multiple): + return (x + multiple - 1) // multiple * multiple + + +def apply_rope(x, positions, max_wavelength=10_000): + """ + Applies RoPE positions [B, L] to x [B, L, H, D]. 
+ """ + d_half = x.shape[-1] // 2 + device = x.device + dtype = x.dtype + x = x.to(torch.float32) + + freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device) + timescale = max_wavelength**freq_exponents + radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32) + + radians = radians[..., None, :] + + sin = torch.sin(radians) # .to(dtype=dtype) + cos = torch.cos(radians) # .to(dtype=dtype) + + x1, x2 = x.split(d_half, dim=-1) + res = torch.empty_like(x) + res[..., :d_half] = x1 * cos - x2 * sin + res[..., d_half:] = x2 * cos + x1 * sin + + return res.to(dtype) + + +# class SmolVLMWithExpertConfig(PretrainedConfig): +# model_type = "SmolVLMWithExpertModel" +# sub_configs = {"smolvlm_config": AutoConfig, "lm_expert_config": AutoConfig} + +# def __init__( +# self, +# smolvlm_config: dict | None = None, +# lm_expert_config: dict | None = None, +# freeze_vision_encoder: bool = True, +# train_expert_only: bool = True, +# attention_implementation: str = "eager", +# load_vlm_weights: bool = False, +# **kwargs, +# ): +# self.load_vlm_weights = load_vlm_weights +# self.freeze_vision_encoder = freeze_vision_encoder +# self.train_expert_only = train_expert_only +# self.attention_implementation = attention_implementation + +# if smolvlm_config is None: +# # Default config from Pi0 +# self.smolvlm_config = CONFIG_MAPPING["smolvlm"]( +# transformers_version="4.48.1", +# _vocab_size=257152, +# bos_token_id=2, +# eos_token_id=1, +# hidden_size=2048, +# image_token_index=257152, +# model_type="smolvlm", +# pad_token_id=0, +# projection_dim=2048, +# text_config={ +# "hidden_activation": "gelu_pytorch_tanh", +# "hidden_size": 2048, +# "intermediate_size": 16384, +# "model_type": "gemma", +# "num_attention_heads": 8, +# "num_hidden_layers": 18, +# "num_image_tokens": 256, +# "num_key_value_heads": 1, +# "torch_dtype": "float32", +# "vocab_size": 257152, +# }, +# vision_config={ +# "hidden_size": 1152, +# 
"intermediate_size": 4304, +# "model_type": "siglip_vision_model", +# "num_attention_heads": 16, +# "num_hidden_layers": 27, +# "num_image_tokens": 256, +# "patch_size": 14, +# "projection_dim": 2048, +# "projector_hidden_act": "gelu_fast", +# "torch_dtype": "float32", +# "vision_use_head": False, +# }, +# ) +# elif isinstance(self.paligemma_config, dict): +# # Override Pi0 default config for PaliGemma +# if "model_type" not in gemma_expert_config: +# paligemma_config["model_type"] = "paligemma" + +# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] +# self.paligemma_config = cfg_cls(**paligemma_config) + +# if gemma_expert_config is None: +# # Default config from Pi0 +# self.gemma_expert_config = CONFIG_MAPPING["gemma"]( +# attention_bias=False, +# attention_dropout=0.0, +# bos_token_id=2, +# eos_token_id=1, +# head_dim=256, +# hidden_act="gelu_pytorch_tanh", +# hidden_activation="gelu_pytorch_tanh", +# hidden_size=1024, +# initializer_range=0.02, +# intermediate_size=4096, +# max_position_embeddings=8192, +# model_type="gemma", +# num_attention_heads=8, +# num_hidden_layers=18, +# num_key_value_heads=1, +# pad_token_id=0, +# rms_norm_eps=1e-06, +# rope_theta=10000.0, +# torch_dtype="float32", +# transformers_version="4.48.1", +# use_cache=True, +# vocab_size=257152, +# ) +# elif isinstance(self.gemma_expert_config, dict): +# # Override Pi0 default config for Gemma Expert +# if "model_type" not in gemma_expert_config: +# gemma_expert_config["model_type"] = "gemma" + +# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] +# self.gemma_expert_config = cfg_cls(**gemma_expert_config) + +# super().__init__(**kwargs) + +# def __post_init__(self): +# super().__post_init__() +# if self.train_expert_only and not self.freeze_vision_encoder: +# raise ValueError( +# "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible." 
+# ) + +# if self.attention_implementation not in ["eager", "fa2", "flex"]: +# raise ValueError( +# f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'." +# ) + +def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256): + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + return hidden_dim + + +class SmolVLMWithExpertModel(nn.Module): + # config_class = PaliGemmaWithExpertConfig + + def __init__(self, model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + load_vlm_weights: bool = True, train_expert_only: bool = True, freeze_vision_encoder: bool = False, + attention_implementation: str = "eager", attention_mode: str = "self_attn", num_expert_layers: int = -1, + num_vlm_layers: int = -1, self_attn_every_n_layers: int = -1, expert_width_multiplier: float = 0.5, self_attn_only_actions: bool = False): + super().__init__() + if load_vlm_weights: + print(f"Loading {model_id} weights ...") + if "SmolVLM-" in model_id: + self.vlm = AutoModelForVision2Seq.from_pretrained( + model_id, + device_map="cuda", + torch_dtype="bfloat16", + low_cpu_mem_usage=True, + ) + else: + # model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + self.vlm = AutoModelForImageTextToText.from_pretrained( + model_id, + device_map="cuda", + torch_dtype="bfloat16", + low_cpu_mem_usage=True, + # attn_implementation="eager", + # attn_implementation="flash_attention_2" + ) + config = self.vlm.config + else: + config = AutoConfig.from_pretrained(model_id) + self.vlm = SmolVLMForConditionalGeneration(config=config) + self.processor = AutoProcessor.from_pretrained(model_id) + if num_vlm_layers > 0: + print(f"Reducing the number of VLM layers to {num_vlm_layers} ...") + self.get_vlm_model().text_model.layers = self.get_vlm_model().text_model.layers[:num_vlm_layers] + self.num_vlm_layers = 
len(self.get_vlm_model().text_model.layers) + self.config = config + # Smaller lm expert + lm_expert_config = copy.deepcopy(config.text_config) + hidden_size = lm_expert_config.hidden_size + lm_expert_config.hidden_size = int(hidden_size*expert_width_multiplier) #hidden_size // 2 + lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size*expert_width_multiplier)) + lm_expert_config.num_hidden_layers = self.num_vlm_layers + if num_expert_layers > 0 : + assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}" + lm_expert_config.num_hidden_layers = num_expert_layers + # lm_expert_config.head_dim = lm_expert_config.head_dim * 2 + self.lm_expert = AutoModel.from_config(lm_expert_config) + + self.num_expert_layers = len(self.lm_expert.layers) + self.self_attn_every_n_layers = self_attn_every_n_layers + self.self_attn_only_actions = self_attn_only_actions + if "cross" in attention_mode: + # Reshape qkv projections to have the same input dimension as the vlm + for layer_idx in range(len(self.lm_expert.layers)): + if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0: + continue + self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear( + config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + ) + self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear( + config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + ) + # Remove unused embed_tokens + self.lm_expert.embed_tokens = None + + self.num_attention_heads = self.config.text_config.num_attention_heads + self.num_key_value_heads = self.config.text_config.num_key_value_heads + 
+ self.freeze_vision_encoder = freeze_vision_encoder + self.train_expert_only = train_expert_only + self.attention_implementation = attention_implementation + self.attention_mode = attention_mode + self.expert_hidden_size = lm_expert_config.hidden_size + # self.to_bfloat16_like_physical_intelligence() + self.set_requires_grad() + + def configure_peft(self, config): + # return model + self.peft_method = config.peft_method + self.peft_target_model = config.peft_target_model + if "lora" in self.peft_method: + peft_config = config.peft_config + target_modules = peft_config.target_modules + if not isinstance(target_modules, list): + target_modules = target_modules.split(",") + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) + r=peft_config.r, # The rank of the low-rank adaptation + lora_alpha=peft_config.lora_alpha, # Scaling factor + lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers + target_modules=target_modules, # The components where LoRA is applied + exclude_modules=[ + "lm_expert", + "model.lm_expert.model.layers", + ], # FIXME(mshukor): this does not work for now + ) + self.lora_config = lora_config + # Apply LoRA and ensure only LoRA parameters are trainable + if "text" in self.peft_target_model: + self.get_vlm_model().text_model = get_peft_model(self.get_vlm_model().text_model, lora_config) + else: + self.vlm = get_peft_model(self.vlm, lora_config) + # assert config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? + for name, param in self.vlm.named_parameters(): + if ( + "lora" in name and "text_model.model.layers.17" not in name + ): # lm_head is not a parameter in most LLMs because it's tied to the embedding layer + param.requires_grad = True + else: + param.requires_grad = False + + def merge_lora_weights(self): + """ + Merge LoRA weights into the base model.
+ """ + if "text" in self.peft_target_model: + self.get_vlm_model().text_model = self.get_vlm_model().text_model.merge_and_unload() + else: + self.vlm = self.vlm.merge_and_unload() + + def get_vlm_model(self,): + if hasattr(self.vlm.model, "model"): # When using peft + return self.vlm.model.model + else: + return self.vlm.model + + def set_requires_grad(self): + if self.freeze_vision_encoder: + self.get_vlm_model().vision_model.eval() + for params in self.get_vlm_model().vision_model.parameters(): + params.requires_grad = False + if self.train_expert_only: + self.vlm.eval() + for params in self.vlm.parameters(): + params.requires_grad = False + else: + # To avoid unused params issue with distributed training + last_layers = [self.num_vlm_layers - 1] + if self.num_vlm_layers != self.num_expert_layers and self.num_vlm_layers % self.num_expert_layers == 0: + last_layers.append(self.num_vlm_layers - 2) + frozen_layers = [ + "lm_head", + "text_model.model.norm.weight", + ] + for layer in last_layers: + frozen_layers.append(f"text_model.model.layers.{layer}.") + + for name, params in self.vlm.named_parameters(): + if any( + [ + k in name + for k in frozen_layers + ] + ): + params.requires_grad = False + # To avoid unused params issue with distributed training + for name, params in self.lm_expert.named_parameters(): + if any( + [ + k in name + for k in [ + "lm_head", + ] + ] + ): + params.requires_grad = False + + def train(self, mode: bool = True): + super().train(mode) + + if self.freeze_vision_encoder: + self.get_vlm_model().vision_model.eval() + + if self.train_expert_only: + self.vlm.eval() + + # def to_bfloat16_like_physical_intelligence(self): + # self.vlm = self.vlm.to(dtype=torch.bfloat16) + + # params_to_change_dtype = [ + # "language_model.model.layers", + # "gemma_expert.model.layers", + # "vision_tower", + # "multi_modal", + # ] + # for name, param in self.named_parameters(): + # if any(selector in name for selector in params_to_change_dtype): + # param.data 
= param.data.to(dtype=torch.bfloat16) + + def embed_image(self, image: torch.Tensor): + patch_attention_mask = None + # # FIXME(mshukor): probably not needed as we don't have padded images here + # pixel_values = image.unsqueeze(1) + # batch_size, num_images, num_channels, height, width = pixel_values.shape + # pixel_values = pixel_values + # pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:]) + + # # Remove padding images - padding images are full 0. + # nb_values_per_image = pixel_values.shape[1:].numel() + # real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image + + # if not any(real_images_inds): + # # no images, leave one empty image. + # real_images_inds[0] = True + + # pixel_values = pixel_values[real_images_inds].contiguous() + + # # Handle the vision attention mask + + # pixel_attention_mask = torch.ones( + # size=[pixel_values.shape[i] for i in (0, 2, 3)], + # dtype=torch.bool, + # device=pixel_values.device, + # ) + + # patch_size = self.vlm.config.vision_config.patch_size + # patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size) + # patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) + # patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # FIXME(mshukor): add special image tokens specific to smolvlm + # Get sequence from the vision encoder + image_hidden_states = self.get_vlm_model().vision_model( + pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype), + patch_attention_mask=patch_attention_mask, + ).last_hidden_state + # Modality projection & resampling + image_hidden_states = self.get_vlm_model().connector(image_hidden_states) + return image_hidden_states + + def embed_language_tokens(self, tokens: torch.Tensor): + return self.get_vlm_model().text_model.get_input_embeddings()(tokens) + + def forward_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, 
attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values=None) -> list[torch.Tensor]: + + query_states = [] + key_states = [] + value_states = [] + for i, hidden_states in enumerate(inputs_embeds): + layer = model_layers[i][layer_idx] + if hidden_states is None or layer is None: + continue + + # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) + # hidden_states = hidden_states * normalizer + hidden_states = layer.input_layernorm(hidden_states) + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) + + hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + query_states.append(query_state) + key_states.append(key_state) + value_states.append(value_state) + + # FIXME(mshukor): self attention always when having only the prefix + # B,L,H,D with L sequence length, H number of heads, D head dim + # concatenate on the number of embeddings/tokens + query_states = torch.cat(query_states, dim=1) + key_states = torch.cat(key_states, dim=1) + value_states = torch.cat(value_states, dim=1) + # FIXME(mshukor): seq should be B, H, L, D ? 
+ seq_len = query_states.shape[1] + if seq_len < position_ids.shape[1]: + _position_ids = position_ids[:, :seq_len] + _attention_mask = attention_mask[:, :seq_len, :seq_len] + else: + _position_ids = position_ids + _attention_mask = attention_mask + + if self.self_attn_only_actions: + attention_mask_ = _attention_mask.clone() + position_ids_ = _position_ids.clone() + if inputs_embeds[1] is not None: + suffix_len = inputs_embeds[1].shape[1] + attention_mask_[:, -suffix_len:, :-suffix_len] = False + position_ids_[:, -suffix_len:] = _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None] + else: + attention_mask_ = _attention_mask + position_ids_ = _position_ids + + query_states = apply_rope(query_states, position_ids_) # FIXME(mshukor): this assumes we have always the vlm features? + key_states = apply_rope(key_states, position_ids_) + + if use_cache and past_key_values is None: + past_key_values = {} + + if use_cache: + if fill_kv_cache: + past_key_values[layer_idx] = { + "key_states": key_states, + "value_states": value_states, + } + else: + # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # the max len, then we (for instance) double the cache size. This implementation already exists + # in `transformers`. 
(molbap) + key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) + value_states = torch.cat( + [past_key_values[layer_idx]["value_states"], value_states], dim=1 + ) + + attention_interface = self.get_attention_interface() + + att_output = attention_interface( + attention_mask_, batch_size, head_dim, query_states, key_states, value_states + ) + # att_output = att_output.to(dtype=models[i].dtype) + + return [att_output], past_key_values + + + def forward_cross_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values = None) -> list[torch.Tensor]: + + attention_interface = self.get_attention_interface() + + att_outputs = [] + assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}" + + if len(inputs_embeds) == 2 and not past_key_values: + # Prefix attention + seq_len = inputs_embeds[0].shape[1] + position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:] + prefix_attention_mask = attention_mask[:, :seq_len, :seq_len] + + layer = model_layers[0][layer_idx] + + hidden_states = layer.input_layernorm(inputs_embeds[0]) + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) + + hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + # B,L,H,D with L sequence length, H number of heads, D head dim + query_states = apply_rope(query_state, position_id) + key_states = apply_rope(key_state, position_id) + + att_output = attention_interface( + prefix_attention_mask, batch_size, 
head_dim, query_states, key_states, value_states + ) + att_outputs.append(att_output) + else: + expert_position_id = position_ids + + if use_cache and past_key_values is None: + past_key_values = {} + + if use_cache: + if fill_kv_cache: + past_key_values[layer_idx] = { + "key_states": key_states, + "value_states": value_states, + } + else: + # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # the max len, then we (for instance) double the cache size. This implementation already exists + # in `transformers`. (molbap) + key_states = past_key_values[layer_idx]["key_states"] + value_states = past_key_values[layer_idx]["value_states"] + + + # Expert + expert_layer = model_layers[1][layer_idx] + if expert_layer is not None: + expert_hidden_states = expert_layer.input_layernorm(inputs_embeds[1]) + + expert_input_shape = expert_hidden_states.shape[:-1] + expert_hidden_shape = (*expert_input_shape, -1, expert_layer.self_attn.head_dim) + + expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype) + expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape) + + + _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(*key_states.shape[:2], -1) + expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(*_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim) # k_proj should have same dim as kv + + _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(*value_states.shape[:2], -1) + expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(*_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim) + + expert_position_id = expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values # start from 0 + expert_attention_mask = 
attention_mask[:, -inputs_embeds[1].shape[1]:, :expert_key_states.shape[1]:] # take into account kv + + expert_query_states = apply_rope(expert_query_state, expert_position_id) + # expert_key_states = apply_rope(expert_key_state, expert_position_id) + + att_output = attention_interface( + expert_attention_mask, batch_size, head_dim, expert_query_states, expert_key_states, expert_value_states + ) + att_outputs.append(att_output) + else: + att_outputs.append(None) + + # att_output = att_output.to(dtype=models[i].dtype) + return att_outputs, past_key_values + + def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient? + vlm_layers = [] + expert_layers = [] + multiple_of = self.num_vlm_layers // self.num_expert_layers + for i in range(self.num_vlm_layers): + if multiple_of > 0 and i > 0 and i % multiple_of != 0: + expert_layer = None + else: + expert_layer_index = i // multiple_of if multiple_of > 0 else i + expert_layer = models[1].layers[expert_layer_index] + vlm_layers.append(models[0].layers[i]) + expert_layers.append(expert_layer) + return [vlm_layers, expert_layers] + # TODO: break down this huge forward into modules or functions + def forward( + self, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, + inputs_embeds: List[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + fill_kv_cache: Optional[bool] = None, + ): + models = [self.get_vlm_model().text_model, self.lm_expert] + model_layers = self.get_model_layers(models) + for hidden_states in inputs_embeds: + # TODO this is very inefficient + # dtype is always the same, batch size too (if > 1 len) + # device could be trickier in multi gpu edge cases but that's it + if hidden_states is None: + continue + batch_size = hidden_states.shape[0] + + # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right 
depending on the gen or train + if self.attention_implementation == "flex": + if inputs_embeds[0] is not None and inputs_embeds[1] is not None and attention_mask.shape[-1] == attention_mask.shape[-2] and past_key_values is None: # Now only during training + seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1] + padded_seq_len = _round_up_to_multiple(seq_len, 128) # FIXME(mshukor): more efficient to have a fixed seq len? + b_mask, q_len, kv_len = attention_mask.shape # The shape of your mask + pad = padded_seq_len - q_len + attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True) + inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0) + position_ids = F.pad(position_ids, (0, pad), value=0) + + + # RMSNorm + num_layers = self.num_vlm_layers + head_dim = self.vlm.config.text_config.head_dim + for layer_idx in range(num_layers): + if fill_kv_cache or "cross" not in self.attention_mode or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0): + att_outputs, past_key_values = self.forward_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + else: + att_outputs, past_key_values = self.forward_cross_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + # query_states = [] + # key_states = [] + # value_states = [] + # for i, hidden_states in enumerate(inputs_embeds): + # if hidden_states is None: + # continue + # layer = models[i].layers[layer_idx] + # # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) + # # hidden_states = hidden_states * normalizer + # hidden_states = layer.input_layernorm(hidden_states) + + # input_shape = hidden_states.shape[:-1] + # hidden_shape = (*input_shape, -1, 
layer.self_attn.head_dim) + + # hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + # query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + # key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + # value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + # query_states.append(query_state) + # key_states.append(key_state) + # value_states.append(value_state) + + # # FIXME(mshukor): self attention always when having only the prefix + # # B,L,H,D with L sequence length, H number of heads, D head dim + # # concatenate on the number of embeddings/tokens + # query_states = torch.cat(query_states, dim=1) + # key_states = torch.cat(key_states, dim=1) + # value_states = torch.cat(value_states, dim=1) + # # FIXME(mshukor): seq should be B, H, L, D ? + # query_states = apply_rope(query_states, position_ids) + # key_states = apply_rope(key_states, position_ids) + + # if use_cache and past_key_values is None: + # past_key_values = {} + + # if use_cache: + # if fill_kv_cache: + # past_key_values[layer_idx] = { + # "key_states": key_states, + # "value_states": value_states, + # } + # else: + # # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # # the max len, then we (for instance) double the cache size. This implementation already exists + # # in `transformers`. 
(molbap) + # key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) + # value_states = torch.cat( + # [past_key_values[layer_idx]["value_states"], value_states], dim=1 + # ) + + # attention_interface = self.get_attention_interface() + # att_output = attention_interface( + # attention_mask, batch_size, head_dim, query_states, key_states, value_states + # ) + + + # att_output = att_output.to(dtype=models[i].dtype) + + # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len]) + outputs_embeds = [] + start = 0 + for i, hidden_states in enumerate(inputs_embeds): + # layer = models[i].layers[layer_idx] + layer = model_layers[i][layer_idx] + att_output = att_outputs[i] if i < len(att_outputs) else att_outputs[0] # in case of self_attn + if hidden_states is not None: + if layer is None: + outputs_embeds.append(hidden_states) + continue + end = start + hidden_states.shape[1] + + if att_output.dtype != layer.self_attn.o_proj.weight.dtype: + att_output = att_output.to(layer.self_attn.o_proj.weight.dtype) + att_out = att_output[:, start:end] + out_emb = layer.self_attn.o_proj(att_out) + + # TODO: first dropout (by default 0.0) + # first residual + out_emb += hidden_states + after_first_residual = out_emb.clone() + + out_emb = layer.post_attention_layernorm(out_emb) + out_emb = layer.mlp(out_emb) + + # TODO: second dropout (by default 0.0) + + # second residual + out_emb += after_first_residual + + outputs_embeds.append(out_emb) + + start = end if len(att_outputs) == 1 else 0 + else: + outputs_embeds.append(None) + + inputs_embeds = outputs_embeds + + # final norm + outputs_embeds = [] + for i, hidden_states in enumerate(inputs_embeds): + if hidden_states is not None: + out_emb = models[i].norm(hidden_states) + outputs_embeds.append(out_emb) + else: + outputs_embeds.append(None) + return outputs_embeds, past_key_values + + def get_attention_interface(self): + if self.attention_implementation == "fa2": + 
attention_interface = self.flash_attention_forward + elif self.attention_implementation == "flex": + attention_interface = partial(flex_attention_forward, num_att_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads) + else: + attention_interface = self.eager_attention_forward + return attention_interface + + def flash_attention_forward( + self, attention_mask, batch_size, head_dim, query_states, key_states, value_states + ): + raise NotImplementedError("FA2 is not implemented (yet)") + + def eager_attention_forward( + self, attention_mask, batch_size, head_dim, query_states, key_states, value_states + ): + num_att_heads = self.num_attention_heads + num_key_value_heads = self.num_key_value_heads + num_key_value_groups = num_att_heads // num_key_value_heads + + # query_states: batch_size, sequence_length, num_att_head, head_dim + # key_states: batch_size, sequence_length, num_key_value_head, head_dim + # value_states: batch_size, sequence_length, num_key_value_head, head_dim + sequence_length = key_states.shape[1] + + key_states = key_states[:, :, :, None, :].expand( + batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim + ) + key_states = key_states.reshape( + batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim + ) + + value_states = value_states[:, :, :, None, :].expand( + batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim + ) + value_states = value_states.reshape( + batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim + ) + + # Attention here is upcasted to float32 to match the original eager implementation. 
+ + query_states = query_states.to(dtype=torch.float32) + key_states = key_states.to(dtype=torch.float32) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + + att_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + att_weights *= head_dim**-0.5 + + att_weights = att_weights.to(dtype=torch.float32) + big_neg = torch.finfo(att_weights.dtype).min #-2.3819763e38 # See gemma/modules.py + masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg) + probs = nn.functional.softmax(masked_att_weights, dim=-1) + probs = probs.to(dtype=value_states.dtype) + + # probs: batch_size, num_key_value_head, num_att_head, sequence_length, sequence_length + # value_states: batch_size, sequence_length, num_att_heads, head_dim + + att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3)) + + att_output = att_output.permute(0, 2, 1, 3) + # we use -1 because sequence length can change + att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) + + return att_output diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 95ed993d2..9b7e3520a 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -1,3 +1,955 @@ +# #!/usr/bin/env python + +# # Copyright 2025 HuggingFace Inc. team. All rights reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# # See the License for the specific language governing permissions and +# # limitations under the License. + +# """ +# SmolVLA: + +# [Paper](https://huggingface.co/papers/2506.01844) + +# Designed by Hugging Face. + +# Install smolvla extra dependencies: +# ```bash +# pip install -e ".[smolvla]" +# ``` + +# Example of finetuning the smolvla pretrained model (`smolvla_base`): +# ```bash +# lerobot-train \ +# --policy.path=lerobot/smolvla_base \ +# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ +# --batch_size=64 \ +# --steps=200000 +# ``` + +# Example of finetuning a smolVLA. SmolVLA is composed of a pretrained VLM, +# and an action expert. +# ```bash +# lerobot-train \ +# --policy.type=smolvla \ +# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ +# --batch_size=64 \ +# --steps=200000 +# ``` + +# Example of using the smolvla pretrained model outside LeRobot training framework: +# ```python +# policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base") +# ``` + +# """ + +# import math +# import os +# import re +# from collections import deque + +# import safetensors +# import torch +# import torch.nn.functional as F # noqa: N812 +# from torch import Tensor, nn +# from transformers import AutoProcessor + +# from lerobot.constants import ACTION +# from lerobot.policies.normalize import ( +# Normalize, +# Unnormalize, +# ) +# from lerobot.policies.pretrained import PreTrainedPolicy +# from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig +# from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel +# from lerobot.policies.utils import ( +# populate_queues, +# ) +# from lerobot.utils.utils import get_safe_dtype +# OBS_STATE = 'state' +# ACTION = 'actions' +# # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker +# _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") + + +# def canonicalise(k: str) -> str: +# """ +# Remove dataset-variant markers like '.so100-blue_' or '.so100_' 
from a +# normalisation-buffer key. +# """ +# return _VARIANT_RE.sub(".buffer_", k) + + +# def standardise_state_dict( +# checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True +# ) -> tuple[dict[str, torch.Tensor], list[str]]: +# """ +# • Re-keys `checkpoint ` so that every entry matches the *reference* key set. +# • If several variant keys collapse to the same canonical name we keep the +# first one and log the collision. +# • Returns the new dict + a list of entries that could not be matched. +# """ +# out, collisions, unmatched = {}, {}, [] + +# for k, v in checkpoint.items(): +# canon = canonicalise(k) +# if canon in ref_keys: +# if canon in out: # duplicate after collapsing +# collisions.setdefault(canon, []).append(k) +# else: +# out[canon] = v +# else: +# unmatched.append(k) + +# if verbose: +# for canon, variants in collisions.items(): +# print(f"[standardise_state_dict] '{canon}' ← {variants}") +# if unmatched: +# print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys") + +# out.update({k: checkpoint[k] for k in unmatched}) +# return out, unmatched + + +# def rename_checkpoint_keys(checkpoint: dict, rename_str: str): +# """ +# Renames keys in a checkpoint dictionary based on the given rename string. + +# Args: +# checkpoint (dict): The checkpoint dictionary. +# rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2". + +# Returns: +# dict: The modified checkpoint with renamed keys. 
+# """ + +# rename_dict = dict(pair.split("//") for pair in rename_str.split(",")) + +# new_checkpoint = {} +# for k, v in checkpoint.items(): +# for old_key, new_key in rename_dict.items(): +# if old_key in k: +# k = k.replace(old_key, new_key) +# new_checkpoint[k] = v +# return new_checkpoint + + +# def load_smolvla( +# model: torch.nn.Module, +# filename: str | os.PathLike, +# *, +# device: str = "cpu", +# checkpoint_keys_mapping: str = "", +# ) -> torch.nn.Module: +# state_dict = safetensors.torch.load_file(filename, device=device) + +# # Optional user-supplied renames (e.g. "model._orig_mod.//model.") +# if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping: +# state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping) + +# state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys())) + +# # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset +# norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs") +# state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)} + +# missing, unexpected = model.load_state_dict(state_dict, strict=False) + +# if not all(key.startswith(norm_keys) for key in missing) or unexpected: +# raise RuntimeError( +# "SmolVLA %d missing / %d unexpected keys", +# len(missing), +# len(unexpected), +# ) + +# return model + + +# def create_sinusoidal_pos_embedding( +# time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" +# ) -> Tensor: +# """Computes sine-cosine positional embedding vectors for scalar positions.""" +# if dimension % 2 != 0: +# raise ValueError(f"dimension ({dimension}) must be divisible by 2") + +# if time.ndim != 1: +# raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + +# dtype = get_safe_dtype(torch.float64, device.type) +# fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) +# period = min_period * 
(max_period / min_period) ** fraction + +# # Compute the outer product +# scaling_factor = 1.0 / period * 2 * math.pi +# sin_input = scaling_factor[None, :] * time[:, None] +# pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) +# return pos_emb + + +# def make_att_2d_masks(pad_masks, att_masks): +# """Copied from big_vision. + +# Tokens can attend to valid inputs tokens which have a cumulative mask_ar +# smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to +# setup several types of attention, for example: + +# [[1 1 1 1 1 1]]: pure causal attention. + +# [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between +# themselves and the last 3 tokens have a causal attention. The first +# entry could also be a 1 without changing behaviour. + +# [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a +# block can attend all previous blocks and all tokens on the same block. + +# Args: +# input_mask: bool[B, N] true if its part of the input, false if padding. +# mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on +# it and 0 where it shares the same attention mask as the previous token. 
+# """ +# if att_masks.ndim != 2: +# raise ValueError(att_masks.ndim) +# if pad_masks.ndim != 2: +# raise ValueError(pad_masks.ndim) + +# cumsum = torch.cumsum(att_masks, dim=1) +# att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] +# pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] +# att_2d_masks = att_2d_masks & pad_2d_masks +# return att_2d_masks + + +# def resize_with_pad(img, width, height, pad_value=-1): +# # assume no-op when width height fits already +# if img.ndim != 4: +# raise ValueError(f"(b,c,h,w) expected, but {img.shape}") + +# cur_height, cur_width = img.shape[2:] + +# ratio = max(cur_width / width, cur_height / height) +# resized_height = int(cur_height / ratio) +# resized_width = int(cur_width / ratio) +# resized_img = F.interpolate( +# img, size=(resized_height, resized_width), mode="bilinear", align_corners=False +# ) + +# pad_height = max(0, int(height - resized_height)) +# pad_width = max(0, int(width - resized_width)) + +# # pad on left and top of image +# padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) +# return padded_img + + +# def pad_vector(vector, new_dim): +# """Can be (batch_size x sequence_length x features_dimension) +# or (batch_size x features_dimension) +# """ +# if vector.shape[-1] == new_dim: +# return vector +# shape = list(vector.shape) +# current_dim = shape[-1] +# shape[-1] = new_dim +# new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) +# new_vector[..., :current_dim] = vector +# return new_vector + + +# def normalize(x, min_val, max_val): +# return (x - min_val) / (max_val - min_val) + + +# def unnormalize(x, min_val, max_val): +# return x * (max_val - min_val) + min_val + + +# def safe_arcsin(value): +# # This ensures that the input stays within +# # [āˆ’1,1] to avoid invalid values for arcsin +# return torch.arcsin(torch.clamp(value, -1.0, 1.0)) + + +# def aloha_gripper_to_angular(value): +# # Aloha transforms the gripper positions into a linear 
space. The following code +# # reverses this transformation to be consistent with smolvla which is pretrained in +# # angular space. +# # +# # These values are coming from the Aloha code: +# # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED +# value = unnormalize(value, min_val=0.01844, max_val=0.05800) + +# # This is the inverse of the angular to linear transformation inside the Interbotix code. +# def linear_to_radian(linear_position, arm_length, horn_radius): +# value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) +# return safe_arcsin(value) + +# # The constants are taken from the Interbotix code. +# value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) + +# # Normalize to [0, 1]. +# # The values 0.4 and 1.5 were measured on an actual Trossen robot. +# return normalize(value, min_val=0.4, max_val=1.5) + + +# def aloha_gripper_from_angular(value): +# # Convert from the gripper position used by smolvla to the gripper position that is used by Aloha. +# # Note that the units are still angular but the range is different. + +# # The values 0.4 and 1.5 were measured on an actual Trossen robot. +# value = unnormalize(value, min_val=0.4, max_val=1.5) + +# # These values are coming from the Aloha code: +# # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE +# return normalize(value, min_val=-0.6213, max_val=1.4910) + + +# def aloha_gripper_from_angular_inv(value): +# # Directly inverts the gripper_from_angular function. 
+# value = unnormalize(value, min_val=-0.6213, max_val=1.4910) +# return normalize(value, min_val=0.4, max_val=1.5) + + +# class SmolVLAPolicy(PreTrainedPolicy): +# """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" + +# config_class = SmolVLAConfig +# name = "smolvla" + +# def __init__( +# self, +# config: SmolVLAConfig, +# dataset_stats: dict[str, dict[str, Tensor]] | None = None, +# ): +# """ +# Args: +# config: Policy configuration class instance or None, in which case the default instantiation of +# the configuration class is used. +# dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected +# that they will be passed with a call to `load_state_dict` before the policy is used. +# """ + +# super().__init__(config) +# config.validate_features() +# self.config = config +# self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats) +# self.normalize_targets = Normalize( +# config.output_features, config.normalization_mapping, dataset_stats +# ) +# self.unnormalize_outputs = Unnormalize( +# config.output_features, config.normalization_mapping, dataset_stats +# ) + +# self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer +# self.model = VLAFlowMatching(config) +# self.reset() + +# def reset(self): +# """This should be called whenever the environment is reset.""" +# self._queues = { +# ACTION: deque(maxlen=self.config.n_action_steps), +# } + +# # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues +# @classmethod +# def _load_as_safetensor( +# cls, +# model: "SmolVLAPolicy", +# model_file: str, +# map_location: str, +# strict: bool, +# ): +# safetensors.torch.load_model(model, model_file, strict=strict, device=map_location) +# return load_smolvla( +# model, +# model_file, +# device=map_location, +# checkpoint_keys_mapping="model._orig_mod.//model.", +# ) + +# 
def get_optim_params(self) -> dict: +# return self.parameters() + +# def _get_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# # TODO: Check if this for loop is needed. +# # Context: In fact, self.queues contains only ACTION field, and in inference, we don't have action in the batch +# # In the case of offline inference, we have the action in the batch +# # that why without the k != ACTION check, it will raise an error because we are trying to stack +# # on an empty container. +# for k in batch: +# if k in self._queues and k != ACTION: +# batch[k] = torch.stack(list(self._queues[k]), dim=1) + +# images, img_masks = self.prepare_images(batch) +# state = self.prepare_state(batch) +# lang_tokens, lang_masks = self.prepare_language(batch) + +# actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise) + +# # Unpad actions +# original_action_dim = self.config.action_feature.shape[0] +# actions = actions[:, :, :original_action_dim] + +# actions = self.unnormalize_outputs({ACTION: actions})[ACTION] + +# if self.config.adapt_to_pi_aloha: +# actions = self._pi_aloha_encode_actions(actions) + +# return actions + +# def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: +# if self.config.adapt_to_pi_aloha: +# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + +# batch = self.normalize_inputs(batch) + +# return batch + +# @torch.no_grad() +# def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# self.eval() + +# batch = self._prepare_batch(batch) +# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + +# actions = self._get_action_chunk(batch, noise) +# return actions + +# @torch.no_grad() +# def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# """Select a single action given environment observations. 
+ +# This method wraps `select_actions` in order to return one action at a time for execution in the +# environment. It works by managing the actions in a queue and only calling `select_actions` when the +# queue is empty. +# """ +# self.eval() +# batch = self._prepare_batch(batch) +# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + +# # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by +# # querying the policy. +# if len(self._queues[ACTION]) == 0: +# actions = self._get_action_chunk(batch, noise) + +# # `self.predict_action_chunk` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue +# # effectively has shape (n_action_steps, batch_size, *), hence the transpose. +# self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) + +# return self._queues[ACTION].popleft() + +# def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: +# """Do a full training forward pass to compute the loss""" +# if self.config.adapt_to_pi_aloha: +# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) +# batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) +# batch = self.normalize_inputs(batch) +# batch = self.normalize_targets(batch) +# images, img_masks = self.prepare_images(batch) +# state = self.prepare_state(batch) +# lang_tokens, lang_masks = self.prepare_language(batch) +# actions = self.prepare_action(batch) +# actions_is_pad = batch.get("actions_id_pad") +# loss_dict = {} +# losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) +# loss_dict["losses_after_forward"] = losses.clone() + +# if actions_is_pad is not None: +# in_episode_bound = ~actions_is_pad +# losses = losses * in_episode_bound.unsqueeze(-1) +# loss_dict["losses_after_in_ep_bound"] = losses.clone() + +# # Remove padding +# losses = losses[:, :, : self.config.max_action_dim] +# 
loss_dict["losses_after_rm_padding"] = losses.clone() + +# # For backward pass +# loss = losses.mean() +# # For backward pass +# loss_dict["loss"] = loss.item() +# return loss, loss_dict + +# def prepare_images(self, batch): +# """Apply SmolVLA preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and +# convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. +# """ +# images = [] +# img_masks = [] +# present_img_keys = [key for key in self.config.image_features if key in batch] +# missing_img_keys = [key for key in self.config.image_features if key not in batch] + +# if len(present_img_keys) == 0: +# raise ValueError( +# f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" +# ) +# # Preprocess image features present in the batch +# for key in present_img_keys: +# img = batch[key][:, -1, :, :, :] if batch[key].ndim == 5 else batch[key] +# if self.config.resize_imgs_with_padding is not None: +# img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) + +# # Normalize from range [0,1] to [-1,1] as expacted by siglip +# img = img * 2.0 - 1.0 + +# bsize = img.shape[0] +# device = img.device +# if f"{key}_padding_mask" in batch: +# mask = batch[f"{key}_padding_mask"].bool() +# else: +# mask = torch.ones(bsize, dtype=torch.bool, device=device) +# images.append(img) +# img_masks.append(mask) + +# # Create image features not present in the batch +# # as fully 0 padded images. 
+# for num_empty_cameras in range(len(missing_img_keys)): +# if num_empty_cameras >= self.config.empty_cameras: +# break +# img = torch.ones_like(img) * -1 +# mask = torch.zeros_like(mask) +# images.append(img) +# img_masks.append(mask) +# return images, img_masks + +# def prepare_language(self, batch) -> tuple[Tensor, Tensor]: +# """Tokenize the text input""" +# device = batch[OBS_STATE].device +# tasks = batch["task"] +# if isinstance(tasks, str): +# tasks = [tasks] + +# if len(tasks) == 1: +# tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] + +# tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] + +# tokenized_prompt = self.language_tokenizer.__call__( +# tasks, +# padding=self.config.pad_language_to, +# padding_side="right", +# max_length=self.config.tokenizer_max_length, +# return_tensors="pt", +# ) +# lang_tokens = tokenized_prompt["input_ids"].to(device=device) +# lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) + +# return lang_tokens, lang_masks + +# def _pi_aloha_decode_state(self, state): +# # Flip the joints. +# for motor_idx in [1, 2, 8, 9]: +# state[:, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. +# for motor_idx in [6, 13]: +# state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) +# return state + +# def _pi_aloha_encode_actions(self, actions): +# # Flip the joints. +# for motor_idx in [1, 2, 8, 9]: +# actions[:, :, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. +# for motor_idx in [6, 13]: +# actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) +# return actions + +# def _pi_aloha_encode_actions_inv(self, actions): +# # Flip the joints again. +# for motor_idx in [1, 2, 8, 9]: +# actions[:, :, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. 
+# for motor_idx in [6, 13]: +# actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) +# return actions + +# def prepare_state(self, batch): +# """Pad state""" +# state = batch[OBS_STATE][:, -1, :] if batch[OBS_STATE].ndim > 2 else batch[OBS_STATE] +# state = pad_vector(state, self.config.max_state_dim) +# return state + +# def prepare_action(self, batch): +# """Pad action""" +# actions = pad_vector(batch[ACTION], self.config.max_action_dim) +# return actions + + +# def pad_tensor(tensor, max_len, pad_value=0): +# """ +# Efficiently pads a tensor along sequence dimension to match max_len. + +# Args: +# tensor (torch.Tensor): Shape (B, L, ...) or (B, L). +# max_len (int): Fixed sequence length. +# pad_value (int/float): Value for padding. + +# Returns: +# torch.Tensor: Shape (B, max_len, ...) or (B, max_len). +# """ +# b, d = tensor.shape[:2] + +# # Create a padded tensor of max_len and copy the existing values +# padded_tensor = torch.full( +# (b, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device +# ) +# padded_tensor[:, :d] = tensor # Efficient in-place copy + +# return padded_tensor + + +# class VLAFlowMatching(nn.Module): +# """ +# SmolVLA + +# [Paper]() + +# Designed by Hugging Face. 
+# ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +# │ actions │ +# │ ā–² │ +# │ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€|────┐ │ +# │ | │────► │ │ │ +# │ | │ kv │ │ │ +# │ | │────► │Action│ │ +# │ | VLM │cache │Expert│ | +# │ │ │────► | │ │ +# │ │ │ │ │ │ +# │ ā””ā–²ā”€ā”€ā–²ā”€ā”€ā”€ā–²ā”€ā”˜ ā””ā”€ā”€ā”€ā–²ā”€ā”€ā”˜ | +# │ │ | | │ | +# │ | | | noise │ +# │ │ │ state │ +# │ │ language tokens │ +# │ image(s) │ +# ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +# """ + +# def __init__(self, config: SmolVLAConfig): +# super().__init__() +# self.config = config + +# self.vlm_with_expert = SmolVLMWithExpertModel( +# model_id=self.config.vlm_model_name, +# freeze_vision_encoder=self.config.freeze_vision_encoder, +# train_expert_only=self.config.train_expert_only, +# load_vlm_weights=self.config.load_vlm_weights, +# attention_mode=self.config.attention_mode, +# num_expert_layers=self.config.num_expert_layers, +# num_vlm_layers=self.config.num_vlm_layers, +# self_attn_every_n_layers=self.config.self_attn_every_n_layers, +# expert_width_multiplier=self.config.expert_width_multiplier, +# ) +# self.state_proj = nn.Linear( +# self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size +# ) +# self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) +# self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) + +# self.action_time_mlp_in = nn.Linear( +# self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size +# ) +# self.action_time_mlp_out = nn.Linear( +# self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size +# ) + +# self.set_requires_grad() +# self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id +# self.global_image_token = 
self.vlm_with_expert.processor.tokenizer.global_image_token_id +# self.global_image_start_token = torch.tensor( +# [self.fake_image_token, self.global_image_token], dtype=torch.long +# ) + +# self.add_image_special_tokens = self.config.add_image_special_tokens +# self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) +# self.prefix_length = self.config.prefix_length + +# def set_requires_grad(self): +# for params in self.state_proj.parameters(): +# params.requires_grad = self.config.train_state_proj + +# def sample_noise(self, shape, device): +# noise = torch.normal( +# mean=0.0, +# std=1.0, +# size=shape, +# dtype=torch.float32, +# device=device, +# ) +# return noise + +# def sample_time(self, bsize, device): +# beta_dist = torch.distributions.Beta(concentration1=1.5, concentration0=1.0) +# time_beta = beta_dist.sample((bsize,)).to(device=device, dtype=torch.float32) +# time = time_beta * 0.999 + 0.001 +# return time + +# def embed_prefix( +# self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None +# ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +# """Embed images with SigLIP and language tokens with embedding layer to prepare +# for SmolVLM transformer processing. 
+# """ +# embs = [] +# pad_masks = [] +# att_masks = [] +# for _img_idx, ( +# img, +# img_mask, +# ) in enumerate(zip(images, img_masks, strict=False)): +# if self.add_image_special_tokens: +# image_start_token = ( +# self.vlm_with_expert.embed_language_tokens( +# self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device) +# ) +# .unsqueeze(0) +# .expand(img.shape[0], -1, -1) +# ) +# image_start_mask = torch.ones_like( +# image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device +# ) +# att_masks += [0] * (image_start_mask.shape[-1]) +# embs.append(image_start_token) +# pad_masks.append(image_start_mask) + +# img_emb = self.vlm_with_expert.embed_image(img) +# img_emb = img_emb + +# # Normalize image embeddings +# img_emb_dim = img_emb.shape[-1] +# img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) + +# bsize, num_img_embs = img_emb.shape[:2] +# img_mask = img_mask[:, None].expand(bsize, num_img_embs) + +# embs.append(img_emb) +# pad_masks.append(img_mask) + +# att_masks += [0] * (num_img_embs) +# if self.add_image_special_tokens: +# image_end_token = ( +# self.vlm_with_expert.embed_language_tokens( +# self.image_end_token.to(device=self.vlm_with_expert.vlm.device) +# ) +# .unsqueeze(0) +# .expand(img.shape[0], -1, -1) +# ) +# image_end_mask = torch.ones_like( +# image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device +# ) +# embs.append(image_end_token) +# pad_masks.append(image_end_mask) +# att_masks += [0] * (image_end_mask.shape[1]) +# lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) +# # Normalize language embeddings +# lang_emb_dim = lang_emb.shape[-1] +# lang_emb = lang_emb * math.sqrt(lang_emb_dim) + +# embs.append(lang_emb) +# pad_masks.append(lang_masks) + +# num_lang_embs = lang_emb.shape[1] +# att_masks += [0] * num_lang_embs + +# state_emb = self.state_proj(state) +# state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb +# 
embs.append(state_emb) +# bsize = state_emb.shape[0] +# device = state_emb.device + +# states_seq_len = state_emb.shape[1] +# state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) +# pad_masks.append(state_mask) + +# # Set attention masks so that image and language inputs do not attend to state or actions +# att_masks += [1] * (states_seq_len) +# embs = torch.cat(embs, dim=1) +# pad_masks = torch.cat(pad_masks, dim=1) +# att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) +# att_masks = att_masks[None, :] + +# seq_len = pad_masks.shape[1] +# if seq_len < self.prefix_length: +# embs = pad_tensor(embs, self.prefix_length, pad_value=0) +# pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) +# att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) + +# att_masks = att_masks.expand(bsize, -1) + +# return embs, pad_masks, att_masks + +# def embed_suffix(self, noisy_actions, timestep): +# """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" +# embs = [] +# pad_masks = [] +# att_masks = [] + +# # Fuse timestep + action information using an MLP +# action_emb = self.action_in_proj(noisy_actions) +# device = action_emb.device +# bsize = action_emb.shape[0] +# dtype = action_emb.dtype +# # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] +# time_emb = create_sinusoidal_pos_embedding( +# timestep, +# self.vlm_with_expert.expert_hidden_size, +# self.config.min_period, +# self.config.max_period, +# device=device, +# ) +# time_emb = time_emb.type(dtype=dtype) + +# time_emb = time_emb[:, None, :].expand_as(action_emb) +# action_time_emb = torch.cat([action_emb, time_emb], dim=2) + +# action_time_emb = self.action_time_mlp_in(action_time_emb) +# action_time_emb = F.silu(action_time_emb) # swish == silu +# action_time_emb = self.action_time_mlp_out(action_time_emb) + +# # Add to input tokens +# embs.append(action_time_emb) + +# bsize, 
action_time_dim = action_time_emb.shape[:2] +# action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) +# pad_masks.append(action_time_mask) + +# # Set attention masks so that image, language and state inputs do not attend to action tokens +# att_masks += [1] * self.config.chunk_size +# embs = torch.cat(embs, dim=1) +# pad_masks = torch.cat(pad_masks, dim=1) +# att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) +# att_masks = att_masks[None, :].expand(bsize, len(att_masks)) +# # added by jade +# seq_len = pad_masks.shape[1] +# if seq_len < self.config.chunk_size: +# embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) +# pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) +# att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) +# return embs, pad_masks, att_masks + +# def forward( +# self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None +# ) -> Tensor: +# """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" +# #added by jade +# if actions.ndim == 2: +# actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) +# if noise is None: +# noise = self.sample_noise(actions.shape, actions.device) + +# if time is None: +# time = self.sample_time(actions.shape[0], actions.device) + +# time_expanded = time[:, None, None] +# x_t = time_expanded * noise + (1 - time_expanded) * actions +# u_t = noise - actions +# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( +# images, img_masks, lang_tokens, lang_masks, state=state +# ) +# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, time) + +# pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) +# att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) + +# att_2d_masks = make_att_2d_masks(pad_masks, att_masks) +# position_ids = torch.cumsum(pad_masks, dim=1) - 1 +# (_, suffix_out), 
_ = self.vlm_with_expert.forward( +# attention_mask=att_2d_masks, +# position_ids=position_ids, +# past_key_values=None, +# inputs_embeds=[prefix_embs, suffix_embs], +# use_cache=False, +# fill_kv_cache=False, +# ) +# # suffix_out = suffix_out[:, -self.config.chunk_size :] +# suffix_out = suffix_out[:, -self.config.chunk_size:, :] +# # Original openpi code, upcast attention output +# suffix_out = suffix_out.to(dtype=torch.float32) +# v_t = self.action_out_proj(suffix_out) +# losses = F.mse_loss(u_t, v_t, reduction="none") +# return losses + +# def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: +# """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" +# bsize = state.shape[0] +# device = state.device + +# if noise is None: +# actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) +# noise = self.sample_noise(actions_shape, device) + +# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( +# images, img_masks, lang_tokens, lang_masks, state=state +# ) +# prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) +# prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 +# # Compute image and language key value cache +# _, past_key_values = self.vlm_with_expert.forward( +# attention_mask=prefix_att_2d_masks, +# position_ids=prefix_position_ids, +# past_key_values=None, +# inputs_embeds=[prefix_embs, None], +# use_cache=self.config.use_cache, +# fill_kv_cache=True, +# ) +# dt = -1.0 / self.config.num_steps +# dt = torch.tensor(dt, dtype=torch.float32, device=device) + +# x_t = noise +# time = torch.tensor(1.0, dtype=torch.float32, device=device) +# while time >= -dt / 2: +# expanded_time = time.expand(bsize) +# v_t = self.denoise_step( +# prefix_pad_masks, +# past_key_values, +# x_t, +# expanded_time, +# ) +# # Euler step +# x_t += dt * v_t +# time += dt +# return x_t + +# def denoise_step( +# self, +# prefix_pad_masks, 
+# past_key_values, +# x_t, +# timestep, +# ): +# """Apply one denoising step of the noise `x_t` at a given timestep.""" +# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, timestep) + +# suffix_len = suffix_pad_masks.shape[1] +# batch_size = prefix_pad_masks.shape[0] +# prefix_len = prefix_pad_masks.shape[1] +# prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) + +# suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) + +# full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) +# prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] +# position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 + +# outputs_embeds, _ = self.vlm_with_expert.forward( +# attention_mask=full_att_2d_masks, +# position_ids=position_ids, +# past_key_values=past_key_values, +# inputs_embeds=[None, suffix_embs], +# use_cache=self.config.use_cache, +# fill_kv_cache=False, +# ) +# suffix_out = outputs_embeds[1] +# suffix_out = suffix_out[:, -self.config.chunk_size :] +# suffix_out = suffix_out.to(dtype=torch.float32) +# v_t = self.action_out_proj(suffix_out) +# return v_t #!/usr/bin/env python # Copyright 2025 HuggingFace Inc. team. All rights reserved. 
@@ -63,7 +1015,7 @@ import torch.nn.functional as F # noqa: N812 from torch import Tensor, nn from transformers import AutoProcessor -from lerobot.constants import ACTION +from lerobot.constants import ACTION, OBS_STATE from lerobot.policies.normalize import ( Normalize, Unnormalize, @@ -75,8 +1027,7 @@ from lerobot.policies.utils import ( populate_queues, ) from lerobot.utils.utils import get_safe_dtype -OBS_STATE = 'state' -ACTION = 'actions' + # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") @@ -825,21 +1776,12 @@ class VLAFlowMatching(nn.Module): pad_masks = torch.cat(pad_masks, dim=1) att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) att_masks = att_masks[None, :].expand(bsize, len(att_masks)) - # added by jade - seq_len = pad_masks.shape[1] - if seq_len < self.config.chunk_size: - embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) - pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) - att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) return embs, pad_masks, att_masks def forward( self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None ) -> Tensor: """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" - #added by jade - if actions.ndim == 2: - actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) if noise is None: noise = self.sample_noise(actions.shape, actions.device) @@ -867,8 +1809,7 @@ class VLAFlowMatching(nn.Module): use_cache=False, fill_kv_cache=False, ) - # suffix_out = suffix_out[:, -self.config.chunk_size :] - suffix_out = suffix_out[:, -self.config.chunk_size:, :] + suffix_out = suffix_out[:, -self.config.chunk_size :] # Original openpi code, upcast attention output suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) @@ -949,4 +1890,4 @@ class 
VLAFlowMatching(nn.Module): suffix_out = suffix_out[:, -self.config.chunk_size :] suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) - return v_t + return v_t \ No newline at end of file diff --git a/src/lerobot/policies/smolvla/modeling_smolvla_v2.py b/src/lerobot/policies/smolvla/modeling_smolvla_v2.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt new file mode 100644 index 000000000..3410062ba --- /dev/null +++ b/src/lerobot/policies/smolvla/saver.txt @@ -0,0 +1 @@ +c \ No newline at end of file diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py index f6a49dccf..e4cd7acac 100644 --- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py @@ -77,8 +77,8 @@ class SmolVLMWithExpertModel(nn.Module): self.vlm = AutoModelForImageTextToText.from_pretrained( model_id, device_map="auto", - # torch_dtype="bfloat16", - torch_dtype=torch.float16, + torch_dtype="bfloat16", + # torch_dtype=torch.float16, low_cpu_mem_usage=True, ) config = self.vlm.config @@ -547,4 +547,4 @@ class SmolVLMWithExpertModel(nn.Module): # we use -1 because sequence length can change att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) - return att_output + return att_output \ No newline at end of file diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 3145bed35..92a3bf833 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -458,6 +458,43 @@ def _compile_episode_data( data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1) return data_dict +from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy +from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata +def _inject_normalization_stats(policy: 
SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): + """Recreate normalization layers with proper stats from the dataset.""" + from lerobot.policies.normalize import Normalize, Unnormalize + + # Convert numpy stats to the format expected by normalization layers + stats = {} + for key, stat_dict in dataset_meta.stats.items(): + stats[key] = { + stat_type: torch.from_numpy(stat_array) if isinstance(stat_array, np.ndarray) else stat_array + for stat_type, stat_array in stat_dict.items() + } + + # Recreate normalization layers with proper stats + normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) + + normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) + + unnormalize_outputs = Unnormalize( + policy.config.output_features, policy.config.normalization_mapping, stats + ) + + # Replace the normalization layers on the policy + policy.normalize_inputs = normalize_inputs + policy.normalize_targets = normalize_targets + policy.unnormalize_outputs = unnormalize_outputs + + print("Normalization layers recreated with dataset stats.") + + +def load_smolvla(cfg, dataset_repo: str): + from lerobot.datasets.lerobot_dataset import LeRobotDataset + dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/') + policy = make_policy(cfg=cfg, ds_meta=dataset.meta) + _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta) # only needed if stats are missing + return policy, dataset @parser.wrap() @@ -466,7 +503,9 @@ def eval_main(cfg: EvalPipelineConfig): # Check device is available device = get_safe_torch_device(cfg.policy.device, log=True) - + #login to hf + from huggingface_hub import login + login() torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True set_seed(cfg.seed) @@ -481,6 +520,9 @@ def eval_main(cfg: EvalPipelineConfig): cfg=cfg.policy, env_cfg=cfg.env, ) + # breakpoint() + load_smolvla(cfg.policy, 
"physical-intelligence/libero") + # breakpoint() policy.eval() with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): if cfg.env.multitask_eval: diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index d5d562518..74219fc38 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -104,6 +104,32 @@ def update_policy( train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict +# def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): +# """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" +# from lerobot.policies.normalize import Normalize, Unnormalize + +# if not hasattr(dataset_meta, "stats") or not dataset_meta.stats: +# print("āš ļø Dataset has no stats, skipping normalization injection.") +# return + +# stats = {} +# for key, stat_dict in dataset_meta.stats.items(): +# stats[key] = { +# stat_type: torch.as_tensor(stat_array) +# if isinstance(stat_array, np.ndarray) +# else stat_array +# for stat_type, stat_array in stat_dict.items() +# } + +# normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) +# normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) +# unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats) + +# policy.normalize_inputs = normalize_inputs +# policy.normalize_targets = normalize_targets +# policy.unnormalize_outputs = unnormalize_outputs + +# print("āœ… Normalization layers injected with dataset stats.") @parser.wrap() def train(cfg: TrainPipelineConfig): @@ -126,7 +152,6 @@ def train(cfg: TrainPipelineConfig): logging.info("Creating dataset") dataset = make_dataset(cfg) - # Create environment used for evaluating checkpoints during training on simulation data. 
# On real-world data, no need to create an environment as evaluations are done outside train.py, # using the eval.py instead, with gym_dora environment and dora-rs. @@ -140,7 +165,6 @@ def train(cfg: TrainPipelineConfig): cfg=cfg.policy, ds_meta=dataset.meta, ) - logging.info("Creating optimizer and scheduler") optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) @@ -203,7 +227,6 @@ def train(cfg: TrainPipelineConfig): start_time = time.perf_counter() batch = next(dl_iter) train_tracker.dataloading_s = time.perf_counter() - start_time - for key in batch: if isinstance(batch[key], torch.Tensor): batch[key] = batch[key].to(device, non_blocking=device.type == "cuda") diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py new file mode 100644 index 000000000..26a9e7aea --- /dev/null +++ b/src/lerobot/scripts/train_2.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import time +from contextlib import nullcontext +from pprint import pformat +from typing import Any + +import torch +from termcolor import colored +from torch.amp import GradScaler +from torch.optim import Optimizer + +from lerobot.configs import parser +from lerobot.configs.train import TrainPipelineConfig +from lerobot.datasets.factory import make_dataset +from lerobot.datasets.sampler import EpisodeAwareSampler +from lerobot.datasets.utils import cycle +from lerobot.envs.factory import make_env +from lerobot.optim.factory import make_optimizer_and_scheduler +from lerobot.policies.factory import make_policy +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.policies.utils import get_device_from_parameters +from lerobot.scripts.eval import eval_policy, eval_policy_multitask +from lerobot.utils.logging_utils import AverageMeter, MetricsTracker +from lerobot.utils.random_utils import set_seed +from lerobot.utils.train_utils import ( + get_step_checkpoint_dir, + get_step_identifier, + load_training_state, + save_checkpoint, + update_last_checkpoint, +) +from lerobot.utils.utils import ( + format_big_number, + get_safe_torch_device, + has_method, + init_logging, +) +from lerobot.utils.wandb_utils import WandBLogger + + +def update_policy( + train_metrics: MetricsTracker, + policy: PreTrainedPolicy, + batch: Any, + optimizer: Optimizer, + grad_clip_norm: float, + grad_scaler: GradScaler, + lr_scheduler=None, + use_amp: bool = False, + lock=None, +) -> tuple[MetricsTracker, dict]: + start_time = time.perf_counter() + device = get_device_from_parameters(policy) + policy.train() + with torch.autocast(device_type=device.type) if use_amp else nullcontext(): + loss, output_dict = policy.forward(batch) + # TODO(rcadene): policy.unnormalize_outputs(out_dict) + grad_scaler.scale(loss).backward() + + # Unscale the gradient of the optimizer's assigned params in-place **prior to gradient clipping**. 
+    grad_scaler.unscale_(optimizer)
+
+    grad_norm = torch.nn.utils.clip_grad_norm_(
+        policy.parameters(),
+        grad_clip_norm,
+        error_if_nonfinite=False,
+    )
+
+    # Optimizer's gradients are already unscaled, so scaler.step does not unscale them,
+    # although it still skips optimizer.step() if the gradients contain infs or NaNs.
+    with lock if lock is not None else nullcontext():
+        grad_scaler.step(optimizer)
+        # Updates the scale for next iteration.
+        grad_scaler.update()
+
+    optimizer.zero_grad()
+
+    # Step through pytorch scheduler at every batch instead of epoch
+    if lr_scheduler is not None:
+        lr_scheduler.step()
+
+    if has_method(policy, "update"):
+        # To possibly update an internal buffer (for instance an Exponential Moving Average like in TDMPC).
+        policy.update()
+
+    train_metrics.loss = loss.item()
+    train_metrics.grad_norm = grad_norm.item()
+    train_metrics.lr = optimizer.param_groups[0]["lr"]
+    train_metrics.update_s = time.perf_counter() - start_time
+    return train_metrics, output_dict
+
+def _inject_normalization_stats(policy: "SmolVLAPolicy", dataset_meta: "LeRobotDatasetMetadata"):  # quoted: names not imported here
+    """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
+    from lerobot.policies.normalize import Normalize, Unnormalize
+
+    if not hasattr(dataset_meta, "stats") or not dataset_meta.stats:
+        print("āš ļø Dataset has no stats, skipping normalization injection.")
+        return
+
+    stats = {}
+    for key, stat_dict in dataset_meta.stats.items():
+        stats[key] = {
+            stat_type: torch.as_tensor(stat_array)
+            if not isinstance(stat_array, torch.Tensor)
+            else stat_array
+            for stat_type, stat_array in stat_dict.items()
+        }
+
+    normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
+    normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+    unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+
+    policy.normalize_inputs = normalize_inputs
+    policy.normalize_targets = normalize_targets
+    policy.unnormalize_outputs = unnormalize_outputs
+
+    print("āœ… Normalization layers injected with dataset stats.")
+
+@parser.wrap()
+def train(cfg: TrainPipelineConfig):
+    cfg.validate()
+    logging.info(pformat(cfg.to_dict()))
+
+    if cfg.wandb.enable and cfg.wandb.project:
+        wandb_logger = WandBLogger(cfg)
+    else:
+        wandb_logger = None
+        logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
+
+    if cfg.seed is not None:
+        set_seed(cfg.seed)
+
+    # Check device is available
+    device = get_safe_torch_device(cfg.policy.device, log=True)
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+
+    logging.info("Creating dataset")
+    dataset = make_dataset(cfg)
+
+    # Create environment used for evaluating checkpoints during training on simulation data.
+    # On real-world data, no need to create an environment as evaluations are done outside train.py,
+    # using the eval.py instead, with gym_dora environment and dora-rs.
+ eval_env = None + if cfg.eval_freq > 0 and cfg.env is not None: + logging.info("Creating env") + eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + + logging.info("Creating policy") + policy = make_policy( + cfg=cfg.policy, + ds_meta=dataset.meta, + ) + logging.info("Creating optimizer and scheduler") + optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) + grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) + + step = 0 # number of policy updates (forward + backward + optim) + + if cfg.resume: + step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) + + num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) + num_total_params = sum(p.numel() for p in policy.parameters()) + + logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") + if cfg.env is not None: + logging.info(f"{cfg.env.task=}") + logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") + logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") + logging.info(f"{dataset.num_episodes=}") + logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") + logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") + + # create dataloader for offline training + if hasattr(cfg.policy, "drop_n_last_frames"): + shuffle = False + sampler = EpisodeAwareSampler( + dataset.episode_data_index, + drop_n_last_frames=cfg.policy.drop_n_last_frames, + shuffle=True, + ) + else: + shuffle = True + sampler = None + + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.num_workers, + batch_size=cfg.batch_size, + shuffle=shuffle, + sampler=sampler, + pin_memory=device.type == "cuda", + drop_last=False, + ) + dl_iter = cycle(dataloader) + + policy.train() + train_metrics = { + "loss": AverageMeter("loss", ":.3f"), + "grad_norm": AverageMeter("grdn", ":.3f"), + 
"lr": AverageMeter("lr", ":0.1e"), + "update_s": AverageMeter("updt_s", ":.3f"), + "dataloading_s": AverageMeter("data_s", ":.3f"), + } + + train_tracker = MetricsTracker( + cfg.batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, initial_step=step + ) + + logging.info("Start offline training on a fixed dataset") + for _ in range(step, cfg.steps): + start_time = time.perf_counter() + batch = next(dl_iter) + train_tracker.dataloading_s = time.perf_counter() - start_time + + for key in batch: + if isinstance(batch[key], torch.Tensor): + batch[key] = batch[key].to(device, non_blocking=device.type == "cuda") + + train_tracker, output_dict = update_policy( + train_tracker, + policy, + batch, + optimizer, + cfg.optimizer.grad_clip_norm, + grad_scaler=grad_scaler, + lr_scheduler=lr_scheduler, + use_amp=cfg.policy.use_amp, + ) + + # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we + # increment `step` here. + step += 1 + train_tracker.step() + is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 + is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps + is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 + + if is_log_step: + logging.info(train_tracker) + if wandb_logger: + wandb_log_dict = train_tracker.to_dict() + if output_dict: + wandb_log_dict.update(output_dict) + wandb_logger.log_dict(wandb_log_dict, step) + train_tracker.reset_averages() + + if cfg.save_checkpoint and is_saving_step: + logging.info(f"Checkpoint policy after step {step}") + checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) + save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler) + update_last_checkpoint(checkpoint_dir) + if wandb_logger: + wandb_logger.log_policy(checkpoint_dir) + + if cfg.env and is_eval_step: + step_id = get_step_identifier(step, cfg.steps) + logging.info(f"Eval policy at step {step}") + with ( + torch.no_grad(), + 
torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(), + ): + if cfg.env.multitask_eval: + eval_info = eval_policy_multitask( + eval_env, + policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + max_parallel_tasks=cfg.env.max_parallel_tasks, + ) + aggregated = eval_info["overall"]["aggregated"] + # Print per-suite stats, log? + for task_group, task_group_info in eval_info.items(): + if task_group == "overall": + continue # Skip the overall stats since we already printed it + print(f"\nAggregated Metrics for {task_group}:") + print(task_group_info["aggregated"]) + else: + eval_info = eval_policy( + eval_env, + policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + ) + aggregated = eval_info["aggregated"] + + eval_metrics = { + "avg_sum_reward": AverageMeter("āˆ‘rwrd", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), + } + eval_tracker = MetricsTracker( + cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step + ) + eval_tracker.eval_s = aggregated.pop("eval_s") + eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward") + eval_tracker.pc_success = aggregated.pop("pc_success") + logging.info(eval_tracker) + if wandb_logger: + wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} + wandb_logger.log_dict(wandb_log_dict, step, mode="eval") + wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval") + + if eval_env: + if cfg.env.multitask_eval: + for _task_group, envs_dict in eval_env.items(): + for _idx, env in envs_dict.items(): + env.close() + else: + eval_env.close() + logging.info("End of training") + + if cfg.policy.push_to_hub: + policy.push_model_to_hub(cfg) + + +def main(): + init_logging() + train() + + +if __name__ == "__main__": + main() diff 
--git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py new file mode 100644 index 000000000..e205f138f --- /dev/null +++ b/src/lerobot/scripts/train_accelerate.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import time +from pprint import pformat +from typing import Any + +import torch +from accelerate import Accelerator +from accelerate.utils import set_seed as accelerate_set_seed +from termcolor import colored +from torch.optim import Optimizer + +from lerobot.datasets.factory import make_dataset +from lerobot.datasets.sampler import EpisodeAwareSampler +from lerobot.envs.factory import make_env +from lerobot.optim.factory import make_optimizer_and_scheduler +from lerobot.policies.factory import make_policy +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.utils.logging_utils import AverageMeter, MetricsTracker +from lerobot.utils.train_utils import ( + get_step_checkpoint_dir, + get_step_identifier, + load_training_state, + save_checkpoint, + update_last_checkpoint, +) +from lerobot.utils.utils import ( + format_big_number, + has_method, + init_logging, +) +from lerobot.configs import parser +from lerobot.configs.train import TrainPipelineConfig +from lerobot.scripts.eval import eval_policy + + +def update_policy( + train_metrics: MetricsTracker, + policy: PreTrainedPolicy, + batch: 
Any, + optimizer: Optimizer, + grad_clip_norm: float, + accelerator: Accelerator, + lr_scheduler=None, +) -> tuple[MetricsTracker, dict]: + start_time = time.perf_counter() + policy.train() + + # Use accelerator's autocast context if mixed precision is enabled + with accelerator.autocast(): + loss, output_dict = policy.forward(batch) + # TODO(rcadene): policy.unnormalize_outputs(out_dict) + + # Use accelerator for backward pass + accelerator.backward(loss) + + # Gradient clipping - accelerator handles unscaling automatically + if accelerator.sync_gradients and grad_clip_norm > 0: + grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm) + else: + grad_norm = torch.tensor(0.0) + + optimizer.step() + lr_scheduler.step() if lr_scheduler is not None else None + optimizer.zero_grad() + + # Update policy-specific buffers if needed + if has_method(policy, "update"): + policy.update() + + # Gather metrics across all processes + loss_value = accelerator.gather(loss.detach()).mean().item() + grad_norm_value = accelerator.gather(grad_norm).mean().item() + + train_metrics.loss = loss_value + train_metrics.grad_norm = grad_norm_value + train_metrics.lr = optimizer.param_groups[0]["lr"] + train_metrics.update_s = time.perf_counter() - start_time + return train_metrics, output_dict + + +@parser.wrap() +def train(cfg: TrainPipelineConfig): + cfg.validate() + logging.info(pformat(cfg.to_dict())) + + # Initialize accelerator + from accelerate.utils import DistributedDataParallelKwargs + # added by jade 2 lines + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False) + accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs]) + + from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem + + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator( + mixed_precision="bf16" if cfg.policy.use_amp else "no", + gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps, 
+ log_with="wandb" if cfg.wandb.enable else None, + kwargs_handlers=[ddp_kwargs], + project_dir=cfg.output_dir, + ) + + accelerator.init_trackers( + project_name=cfg.wandb.project, + init_kwargs={ + "wandb": { + "entity": cfg.wandb.entity, + "name": cfg.job_name, + "notes": cfg.wandb.notes, + "tags": cfg_to_group(cfg, return_list=True), + "dir": cfg.output_dir, + "config": cfg.to_dict(), + "save_code": False, + "job_type": "train_eval", + "mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online", + "resume": "must" if cfg.resume else None, + "id": cfg.wandb.run_id + if cfg.wandb.run_id + else (get_wandb_run_id_from_filesystem(cfg.output_dir) if cfg.resume else None), + } + }, + ) + + # Set seed for reproducibility + if cfg.seed is not None: + accelerate_set_seed(cfg.seed) + + # Setup device - accelerator handles device placement + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = True + + # Create dataset + if accelerator.is_main_process: + logging.info("Creating dataset") + dataset = make_dataset(cfg) + print("c") + # Create evaluation environment (only on main process) + eval_env = None + if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process: + logging.info("Creating env") + eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + + # Create policy + if accelerator.is_main_process: + logging.info("Creating policy") + + # Use accelerator's device instead of cfg.policy.device + with accelerator.main_process_first(): + policy = make_policy( + cfg=cfg.policy, + ds_meta=dataset.meta, + ) + + # Create optimizer and scheduler + if accelerator.is_main_process: + logging.info("Creating optimizer and scheduler") + optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) + + step = 0 # number of policy updates + + if cfg.resume: + step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) + + # 
Prepare dataloader + if hasattr(cfg.policy, "drop_n_last_frames"): + shuffle = False + sampler = EpisodeAwareSampler( + dataset.episode_data_index, + drop_n_last_frames=cfg.policy.drop_n_last_frames, + shuffle=True, + ) + else: + shuffle = True + sampler = None + + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.num_workers, + batch_size=cfg.batch_size, + shuffle=shuffle, + sampler=sampler, + pin_memory=True, + drop_last=True, # Important for distributed training + ) + + # Prepare for distributed training + policy, optimizer, dataloader, lr_scheduler = accelerator.prepare( + policy, optimizer, dataloader, lr_scheduler + ) + + # Log training info (only on main process) + if accelerator.is_main_process: + num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) + num_total_params = sum(p.numel() for p in policy.parameters()) + + logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") + if cfg.env is not None: + logging.info(f"{cfg.env.task=}") + logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") + logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") + logging.info(f"{dataset.num_episodes=}") + logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") + logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") + logging.info(f"Number of processes: {accelerator.num_processes}") + logging.info(f"Device: {accelerator.device}") + logging.info(f"Mixed precision: {accelerator.mixed_precision}") + + # Create metrics trackers + train_metrics = { + "loss": AverageMeter("loss", ":.3f"), + "grad_norm": AverageMeter("grdn", ":.3f"), + "lr": AverageMeter("lr", ":0.1e"), + "update_s": AverageMeter("updt_s", ":.3f"), + "dataloading_s": AverageMeter("data_s", ":.3f"), + } + + train_tracker = MetricsTracker( + cfg.batch_size * accelerator.num_processes, # Account for all processes + dataset.num_frames, + 
dataset.num_episodes, + train_metrics, + initial_step=step, + ) + + # Training loop + policy.train() + if accelerator.is_main_process: + logging.info("Start offline training on a fixed dataset") + + # Create iterator from dataloader + dl_iter = iter(dataloader) + + for current_step in range(step, cfg.steps): + start_time = time.perf_counter() + # Get next batch, cycling through dataloader if needed + try: + batch = next(dl_iter) + print("data laoder batch keys: ", batch.keys()) + breakpoint() + except StopIteration: + dl_iter = iter(dataloader) + batch = next(dl_iter) + train_tracker.dataloading_s = time.perf_counter() - start_time + # Update policy + train_tracker, output_dict = update_policy( + train_tracker, + policy, + batch, + optimizer, + cfg.optimizer.grad_clip_norm, + accelerator, + lr_scheduler=lr_scheduler, + ) + + # Increment step counter + step += 1 + train_tracker.step() + + # Determine if we should log, save, or evaluate + is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 + is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps + is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 + + # Logging (only on main process) + if is_log_step and accelerator.is_main_process: + logging.info(train_tracker) + wandb_log_dict = train_tracker.to_dict() + if output_dict: + wandb_log_dict.update(output_dict) + for k, v in wandb_log_dict.items(): + accelerator.log({f"{'train'}/{k}": v}, step=step) + train_tracker.reset_averages() + + # Checkpointing (only on main process) + if cfg.save_checkpoint and is_saving_step: + # āœ… all processes wait here + accelerator.wait_for_everyone() + + if accelerator.is_main_process: + logging.info(f"Checkpoint policy after step {step}") + checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) + + unwrapped_policy = accelerator.unwrap_model(policy) + save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler) + update_last_checkpoint(checkpoint_dir) + + # āœ… all 
processes sync again after saving + accelerator.wait_for_everyone() + + # if wandb_logger: + # wandb_logger.log_policy(checkpoint_dir) + + # Evaluation (only on main process) + if cfg.env and is_eval_step and accelerator.is_main_process: + step_id = get_step_identifier(step, cfg.steps) + logging.info(f"Eval policy at step {step}") + + # Unwrap model for evaluation + unwrapped_policy = accelerator.unwrap_model(policy) + unwrapped_policy.eval() + + with torch.no_grad(): + eval_info = eval_policy( + eval_env, + unwrapped_policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + ) + + eval_metrics = { + "avg_sum_reward": AverageMeter("āˆ‘rwrd", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), + } + eval_tracker = MetricsTracker( + cfg.batch_size * accelerator.num_processes, + dataset.num_frames, + dataset.num_episodes, + eval_metrics, + initial_step=step, + ) + eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s") + eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward") + eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success") + logging.info(eval_tracker) + + wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} + for k, v in wandb_log_dict.items(): + accelerator.log({f"{'eval'}/{k}": v}, step=step) + + # Set back to training mode + policy.train() + + # Wait for all processes to finish + accelerator.wait_for_everyone() + + # Cleanup + if eval_env and accelerator.is_main_process: + eval_env.close() + + if accelerator.is_main_process: + logging.info("End of training") + accelerator.end_training() # added by jade + + +if __name__ == "__main__": + init_logging() + train() diff --git a/tmux_log.txt b/tmux_log.txt new file mode 100644 index 000000000..4936578bc --- /dev/null +++ b/tmux_log.txt @@ -0,0 +1,2008 @@ + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': 
{'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-08 13:23:15 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:23:15 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:23:15 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 
'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 
2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +INFO 2025-09-08 13:23:15 celerate.py:149 Creating dataset +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 35414.48it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 5660.00it/s] +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 43760.67it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 5629.72it/s] +c +INFO 2025-09-08 13:23:22 celerate.py:160 Creating policy +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. 
+ warnings.warn( +c +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank1]:[W908 13:23:22.785516795 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +Reducing the number of VLM layers to 16 ... +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank0]:[W908 13:23:43.028071493 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +INFO 2025-09-08 13:23:43 celerate.py:171 Creating optimizer and scheduler +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-08 13:24:04 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla +_lr1e-4bs32steps100000 +INFO 2025-09-08 13:24:04 celerate.py:213 cfg.env.task='libero_spatial' +INFO 2025-09-08 13:24:04 celerate.py:214 cfg.steps=100000 (100K) +INFO 2025-09-08 13:24:04 celerate.py:215 dataset.num_frames=273465 (273K) +INFO 2025-09-08 13:24:04 celerate.py:216 dataset.num_episodes=1693 +INFO 2025-09-08 13:24:04 celerate.py:217 num_learnable_params=99880992 (100M) +INFO 2025-09-08 13:24:04 celerate.py:218 num_total_params=450046220 (450M) +INFO 2025-09-08 13:24:04 celerate.py:219 Number of processes: 2 +INFO 2025-09-08 13:24:04 celerate.py:220 Device: cuda:0 +INFO 2025-09-08 13:24:04 celerate.py:221 Mixed precision: bf16 +INFO 2025-09-08 13:24:04 celerate.py:243 Start offline training on a fixed dataset + +bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp +', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima +ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) +> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() +-> train_tracker, output_dict = update_policy( +(Pdb) +bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp +', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima +ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) +> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() +-> train_tracker, output_dict = update_policy( +(Pdb) batch.keys()[rank0]:[W908 13:24:43.868440913 reducer.cpp:1430] Warning: find_unused_parameters=True was specif +ied in DDP constructor, but did not find any unused parameters in the forward pass. 
This flag results in an extra tr +aversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never h +as any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false +positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +policy.config.input_features +*** SyntaxError: invalid syntax +(Pdb) policy.config.input_features +*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' +(Pdb) policy +DistributedDataParallel( + (module): SmolVLAPolicy( + (normalize_inputs): Normalize( + (buffer_observation_state): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] + ) + ) + (normalize_targets): Normalize( + (buffer_action): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + ) + ) + (unnormalize_outputs): Unnormalize( + (buffer_action): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + ) + ) + (model): VLAFlowMatching( + (vlm_with_expert): SmolVLMWithExpertModel( + (vlm): SmolVLMForConditionalGeneration( + (model): SmolVLMModel( + (vision_model): SmolVLMVisionTransformer( + (embeddings): SmolVLMVisionEmbeddings( + (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid) + (position_embedding): Embedding(1024, 768) + ) + (encoder): SmolVLMEncoder( + (layers): ModuleList( + (0-11): 12 x SmolVLMEncoderLayer( + (self_attn): SmolVLMVisionAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, 
bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): SmolVLMVisionMLP( + (activation_fn): PytorchGELUTanh() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + ) + ) + ) + (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + ) + (connector): SmolVLMConnector( + (modality_projection): SmolVLMSimpleMLP( + (proj): Linear(in_features=12288, out_features=960, bias=False) + ) + ) + (text_model): LlamaModel( + (embed_tokens): Embedding(49280, 960, padding_idx=2) + (layers): ModuleList( + (0-15): 16 x LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=960, out_features=960, bias=False) + (k_proj): Linear(in_features=960, out_features=320, bias=False) + (v_proj): Linear(in_features=960, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=960, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=960, out_features=2560, bias=False) + (up_proj): Linear(in_features=960, out_features=2560, bias=False) + (down_proj): Linear(in_features=2560, out_features=960, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((960,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((960,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + ) + (lm_head): Linear(in_features=960, out_features=49280, bias=False) + ) + (lm_expert): LlamaModel( + (embed_tokens): None + (layers): ModuleList( + (0): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, 
out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (1): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (2): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (3): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): 
Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (4): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (5): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (6): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, 
bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (7): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (8): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (9): LlamaDecoderLayer( + 
(self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (10): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (11): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + 
(post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (12): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (13): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (14): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, 
bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (15): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((720,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + ) + (state_proj): Linear(in_features=32, out_features=960, bias=True) + (action_in_proj): Linear(in_features=32, out_features=720, bias=True) + (action_out_proj): Linear(in_features=720, out_features=32, bias=True) + (action_time_mlp_in): Linear(in_features=1440, out_features=720, bias=True) + (action_time_mlp_out): Linear(in_features=720, out_features=720, bias=True) + ) + ) +) +(Pdb) policy.config +*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' +(Pdb) policy.input_features +*** AttributeError: 'DistributedDataParallel' object has no attribute 'input_features' +(Pdb) quit() +[rank1]: Traceback (most recent call last): +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 368, in +[rank1]: train() +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner +[rank1]: response = fn(cfg, *args, **kwargs) +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train +[rank1]: train_tracker, 
output_dict = update_policy( +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train +[rank1]: train_tracker, output_dict = update_policy( +[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch +[rank1]: return self.dispatch_line(frame) +[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line +[rank1]: if self.quitting: raise BdbQuit +[rank1]: bdb.BdbQuit +W0908 13:25:34.274000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 776 +663 closing signal SIGTERM +E0908 13:25:34.589000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1 +) local_rank: 1 (pid: 776664) of binary: /home/jade_choghari/miniconda3/envs/lerobot/bin/python +Traceback (most recent call last): + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1245, in + main() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1241, in main + launch_command(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1226, in launch_command + multi_gpu_launcher(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 853, in multi_gpu_launcher + distrib_run.run(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/run.py", line 883 +, in run + elastic_launch( + File 
"/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", + line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", + line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +src/lerobot/scripts/train_accelerate.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-09-08_13:25:34 + host : hf-dgx-01 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 776664) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/7_train_acc.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000 +The following values were not passed to `accelerate launch` and had defaults used instead: + More than one GPU was found, enabling multi-GPU training. + If this was unintended please pass in `--num_processes=1`. + `--dynamo_backend` was set to a value of `'no'` +To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`. +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/utils/launch.py:238: UserWarning +: Port `29522` is already in use. Accelerate will attempt to launch in a standalone-like mode by finding an open por +t automatically for this session. 
If this current attempt fails, or for more control in future runs, please specify +a different port (e.g., `--main_process_port `) or use `--main_process_port 0` for automatic selec +tion in your launch command or Accelerate config file. + warnings.warn( +INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 
'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 
'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', 
+ 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 
'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +INFO 2025-09-08 13:33:47 celerate.py:149 Creating dataset +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 103295.66it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 5229.81it/s] +Resolving data files: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 1693/1693 [00:00<00:00, 360601.09it/s] +Loading dataset shards: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 69/69 [00:00<00:00, 4881.54it/s] +c +INFO 2025-09-08 13:33:53 celerate.py:160 Creating policy +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +c +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. 
Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank1]:[W908 13:33:54.613597516 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +Reducing the number of VLM layers to 16 ... +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank0]:[W908 13:34:15.806448425 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +INFO 2025-09-08 13:34:15 celerate.py:171 Creating optimizer and scheduler +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +Reducing the number of VLM layers to 16 ... 
+INFO 2025-09-08 13:34:36 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla +_lr1e-4bs32steps100000 +INFO 2025-09-08 13:34:36 celerate.py:213 cfg.env.task='libero_spatial' +INFO 2025-09-08 13:34:36 celerate.py:214 cfg.steps=100000 (100K) +INFO 2025-09-08 13:34:36 celerate.py:215 dataset.num_frames=273465 (273K) +INFO 2025-09-08 13:34:36 celerate.py:216 dataset.num_episodes=1693 +INFO 2025-09-08 13:34:36 celerate.py:217 num_learnable_params=99880992 (100M) +INFO 2025-09-08 13:34:36 celerate.py:218 num_total_params=450046220 (450M) +INFO 2025-09-08 13:34:36 celerate.py:219 Number of processes: 2 +INFO 2025-09-08 13:34:36 celerate.py:220 Device: cuda:0 +INFO 2025-09-08 13:34:36 celerate.py:221 Mixed precision: bf16 +INFO 2025-09-08 13:34:36 celerate.py:243 Start offline training on a fixed dataset +[rank1]:[W908 13:34:39.454560620 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru +ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut +ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para +meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m +odel has flow control causing later iterations to have unused parameters. (function operator()) +[rank0]:[W908 13:34:40.502702504 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru +ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut +ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para +meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m +odel has flow control causing later iterations to have unused parameters. 
(function operator()) +INFO 2025-09-08 13:36:23 celerate.py:281 step:200 smpl:13K ep:79 epch:0.05 loss:0.963 grdn:2.699 lr:2.0e-05 updt_s:0 +.506 data_s:0.027 +INFO 2025-09-08 13:38:09 celerate.py:281 step:400 smpl:26K ep:158 epch:0.09 loss:0.389 grdn:3.127 lr:6.0e-05 updt_s: +0.525 data_s:0.003 +INFO 2025-09-08 13:39:53 celerate.py:281 step:600 smpl:38K ep:238 epch:0.14 loss:0.261 grdn:2.618 lr:9.5e-05 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 13:41:37 celerate.py:281 step:800 smpl:51K ep:317 epch:0.19 loss:0.231 grdn:1.684 lr:9.9e-05 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 13:43:21 celerate.py:281 step:1K smpl:64K ep:396 epch:0.23 loss:0.211 grdn:1.258 lr:9.9e-05 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 13:45:05 celerate.py:281 step:1K smpl:77K ep:475 epch:0.28 loss:0.198 grdn:1.032 lr:9.9e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 13:46:49 celerate.py:281 step:1K smpl:90K ep:555 epch:0.33 loss:0.182 grdn:0.880 lr:9.8e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 13:48:33 celerate.py:281 step:2K smpl:102K ep:634 epch:0.37 loss:0.167 grdn:0.744 lr:9.8e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:50:17 celerate.py:281 step:2K smpl:115K ep:713 epch:0.42 loss:0.157 grdn:0.680 lr:9.7e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:52:01 celerate.py:281 step:2K smpl:128K ep:792 epch:0.47 loss:0.147 grdn:0.612 lr:9.6e-05 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 13:53:44 celerate.py:281 step:2K smpl:141K ep:872 epch:0.51 loss:0.142 grdn:0.576 lr:9.5e-05 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 13:55:27 celerate.py:281 step:2K smpl:154K ep:951 epch:0.56 loss:0.136 grdn:0.523 lr:9.4e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:57:11 celerate.py:281 step:3K smpl:166K ep:1K epch:0.61 loss:0.132 grdn:0.509 lr:9.3e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 13:58:57 celerate.py:281 step:3K smpl:179K ep:1K epch:0.66 loss:0.126 grdn:0.492 lr:9.2e-05 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 14:00:43 celerate.py:281 step:3K 
smpl:192K ep:1K epch:0.70 loss:0.124 grdn:0.467 lr:9.1e-05 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 14:02:26 celerate.py:281 step:3K smpl:205K ep:1K epch:0.75 loss:0.119 grdn:0.438 lr:9.0e-05 updt_s:0 +.508 data_s:0.003 +INFO 2025-09-08 14:04:27 celerate.py:281 step:3K smpl:218K ep:1K epch:0.80 loss:0.118 grdn:0.426 lr:8.9e-05 updt_s:0 +.564 data_s:0.039 +INFO 2025-09-08 14:06:10 celerate.py:281 step:4K smpl:230K ep:1K epch:0.84 loss:0.116 grdn:0.422 lr:8.7e-05 updt_s:0 +.511 data_s:0.004 +INFO 2025-09-08 14:07:55 celerate.py:281 step:4K smpl:243K ep:2K epch:0.89 loss:0.113 grdn:0.395 lr:8.6e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:09:38 celerate.py:281 step:4K smpl:256K ep:2K epch:0.94 loss:0.111 grdn:0.401 lr:8.5e-05 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 14:11:21 celerate.py:281 step:4K smpl:269K ep:2K epch:0.98 loss:0.110 grdn:0.380 lr:8.3e-05 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 14:13:08 celerate.py:281 step:4K smpl:282K ep:2K epch:1.03 loss:0.109 grdn:0.381 lr:8.2e-05 updt_s:0 +.413 data_s:0.119 +INFO 2025-09-08 14:14:52 celerate.py:281 step:5K smpl:294K ep:2K epch:1.08 loss:0.107 grdn:0.387 lr:8.0e-05 updt_s:0 +.373 data_s:0.146 +INFO 2025-09-08 14:16:36 celerate.py:281 step:5K smpl:307K ep:2K epch:1.12 loss:0.107 grdn:0.366 lr:7.8e-05 updt_s:0 +.446 data_s:0.072 +INFO 2025-09-08 14:18:19 celerate.py:281 step:5K smpl:320K ep:2K epch:1.17 loss:0.105 grdn:0.347 lr:7.6e-05 updt_s:0 +.468 data_s:0.045 +INFO 2025-09-08 14:20:01 celerate.py:281 step:5K smpl:333K ep:2K epch:1.22 loss:0.103 grdn:0.350 lr:7.5e-05 updt_s:0 +.510 data_s:0.003 +INFO 2025-09-08 14:21:46 celerate.py:281 step:5K smpl:346K ep:2K epch:1.26 loss:0.101 grdn:0.336 lr:7.3e-05 updt_s:0 +.512 data_s:0.011 +INFO 2025-09-08 14:23:30 celerate.py:281 step:6K smpl:358K ep:2K epch:1.31 loss:0.102 grdn:0.345 lr:7.1e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 14:25:15 celerate.py:281 step:6K smpl:371K ep:2K epch:1.36 loss:0.100 grdn:0.333 lr:6.9e-05 updt_s:0 +.521 
data_s:0.003 +INFO 2025-09-08 14:26:59 celerate.py:281 step:6K smpl:384K ep:2K epch:1.40 loss:0.100 grdn:0.328 lr:6.7e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 14:28:43 celerate.py:281 step:6K smpl:397K ep:2K epch:1.45 loss:0.099 grdn:0.319 lr:6.5e-05 updt_s:0 +.512 data_s:0.003 +INFO 2025-09-08 14:30:26 celerate.py:281 step:6K smpl:410K ep:3K epch:1.50 loss:0.098 grdn:0.313 lr:6.3e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 14:32:11 celerate.py:281 step:7K smpl:422K ep:3K epch:1.54 loss:0.097 grdn:0.319 lr:6.1e-05 updt_s:0 +.519 data_s:0.004 +INFO 2025-09-08 14:33:55 celerate.py:281 step:7K smpl:435K ep:3K epch:1.59 loss:0.097 grdn:0.312 lr:5.9e-05 updt_s:0 +.506 data_s:0.010 +INFO 2025-09-08 14:35:39 celerate.py:281 step:7K smpl:448K ep:3K epch:1.64 loss:0.097 grdn:0.307 lr:5.7e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 14:37:23 celerate.py:281 step:7K smpl:461K ep:3K epch:1.69 loss:0.095 grdn:0.294 lr:5.5e-05 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 14:39:07 celerate.py:281 step:7K smpl:474K ep:3K epch:1.73 loss:0.095 grdn:0.299 lr:5.3e-05 updt_s:0 +.507 data_s:0.007 +INFO 2025-09-08 14:40:52 celerate.py:281 step:8K smpl:486K ep:3K epch:1.78 loss:0.094 grdn:0.283 lr:5.1e-05 updt_s:0 +.523 data_s:0.003 +INFO 2025-09-08 14:42:36 celerate.py:281 step:8K smpl:499K ep:3K epch:1.83 loss:0.093 grdn:0.284 lr:4.9e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:44:22 celerate.py:281 step:8K smpl:512K ep:3K epch:1.87 loss:0.092 grdn:0.284 lr:4.7e-05 updt_s:0 +.465 data_s:0.060 +INFO 2025-09-08 14:46:06 celerate.py:281 step:8K smpl:525K ep:3K epch:1.92 loss:0.093 grdn:0.292 lr:4.5e-05 updt_s:0 +.456 data_s:0.066 +INFO 2025-09-08 14:47:49 celerate.py:281 step:8K smpl:538K ep:3K epch:1.97 loss:0.093 grdn:0.290 lr:4.3e-05 updt_s:0 +.510 data_s:0.003 +INFO 2025-09-08 14:49:37 celerate.py:281 step:9K smpl:550K ep:3K epch:2.01 loss:0.092 grdn:0.283 lr:4.1e-05 updt_s:0 +.419 data_s:0.117 +INFO 2025-09-08 14:51:20 celerate.py:281 step:9K smpl:563K ep:3K 
epch:2.06 loss:0.092 grdn:0.275 lr:3.9e-05 updt_s:0 +.463 data_s:0.053 +INFO 2025-09-08 14:53:05 celerate.py:281 step:9K smpl:576K ep:4K epch:2.11 loss:0.090 grdn:0.272 lr:3.7e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:54:49 celerate.py:281 step:9K smpl:589K ep:4K epch:2.15 loss:0.090 grdn:0.268 lr:3.5e-05 updt_s:0 +.506 data_s:0.013 +INFO 2025-09-08 14:56:32 celerate.py:281 step:9K smpl:602K ep:4K epch:2.20 loss:0.090 grdn:0.271 lr:3.3e-05 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 14:58:17 celerate.py:281 step:10K smpl:614K ep:4K epch:2.25 loss:0.090 grdn:0.268 lr:3.1e-05 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 15:00:02 celerate.py:281 step:10K smpl:627K ep:4K epch:2.29 loss:0.089 grdn:0.261 lr:3.0e-05 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 15:01:48 celerate.py:281 step:10K smpl:640K ep:4K epch:2.34 loss:0.090 grdn:0.271 lr:2.8e-05 updt_s: +0.526 data_s:0.003 +INFO 2025-09-08 15:03:33 celerate.py:281 step:10K smpl:653K ep:4K epch:2.39 loss:0.089 grdn:0.262 lr:2.6e-05 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:05:18 celerate.py:281 step:10K smpl:666K ep:4K epch:2.43 loss:0.090 grdn:0.264 lr:2.4e-05 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 15:07:32 celerate.py:281 step:11K smpl:678K ep:4K epch:2.48 loss:0.089 grdn:0.255 lr:2.3e-05 updt_s: +0.663 data_s:0.004 +INFO 2025-09-08 15:09:21 celerate.py:281 step:11K smpl:691K ep:4K epch:2.53 loss:0.090 grdn:0.263 lr:2.1e-05 updt_s: +0.514 data_s:0.030 +INFO 2025-09-08 15:11:06 celerate.py:281 step:11K smpl:704K ep:4K epch:2.57 loss:0.088 grdn:0.254 lr:1.9e-05 updt_s: +0.517 data_s:0.006 +INFO 2025-09-08 15:12:51 celerate.py:281 step:11K smpl:717K ep:4K epch:2.62 loss:0.088 grdn:0.252 lr:1.8e-05 updt_s: +0.517 data_s:0.005 +INFO 2025-09-08 15:14:38 celerate.py:281 step:11K smpl:730K ep:5K epch:2.67 loss:0.088 grdn:0.251 lr:1.6e-05 updt_s: +0.532 data_s:0.003 +INFO 2025-09-08 15:16:23 celerate.py:281 step:12K smpl:742K ep:5K epch:2.71 loss:0.088 grdn:0.253 lr:1.5e-05 updt_s: +0.520 data_s:0.003 
+INFO 2025-09-08 15:18:08 celerate.py:281 step:12K smpl:755K ep:5K epch:2.76 loss:0.087 grdn:0.244 lr:1.4e-05 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:19:54 celerate.py:281 step:12K smpl:768K ep:5K epch:2.81 loss:0.088 grdn:0.247 lr:1.2e-05 updt_s: +0.524 data_s:0.003 +INFO 2025-09-08 15:21:39 celerate.py:281 step:12K smpl:781K ep:5K epch:2.86 loss:0.087 grdn:0.242 lr:1.1e-05 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 15:23:32 celerate.py:281 step:12K smpl:794K ep:5K epch:2.90 loss:0.088 grdn:0.243 lr:1.0e-05 updt_s: +0.560 data_s:0.003 +INFO 2025-09-08 15:25:48 celerate.py:281 step:13K smpl:806K ep:5K epch:2.95 loss:0.087 grdn:0.240 lr:9.0e-06 updt_s: +0.674 data_s:0.005 +INFO 2025-09-08 15:28:02 celerate.py:281 step:13K smpl:819K ep:5K epch:3.00 loss:0.088 grdn:0.245 lr:8.0e-06 updt_s: +0.662 data_s:0.004 +INFO 2025-09-08 15:31:06 celerate.py:281 step:13K smpl:832K ep:5K epch:3.04 loss:0.086 grdn:0.236 lr:7.1e-06 updt_s: +0.688 data_s:0.231 +INFO 2025-09-08 15:32:52 celerate.py:281 step:13K smpl:845K ep:5K epch:3.09 loss:0.087 grdn:0.231 lr:6.3e-06 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:35:46 celerate.py:281 step:13K smpl:858K ep:5K epch:3.14 loss:0.088 grdn:0.235 lr:5.6e-06 updt_s: +0.637 data_s:0.232 +INFO 2025-09-08 15:37:34 celerate.py:281 step:14K smpl:870K ep:5K epch:3.18 loss:0.087 grdn:0.238 lr:4.9e-06 updt_s: +0.514 data_s:0.025 +INFO 2025-09-08 15:39:18 celerate.py:281 step:14K smpl:883K ep:5K epch:3.23 loss:0.087 grdn:0.226 lr:4.3e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:41:02 celerate.py:281 step:14K smpl:896K ep:6K epch:3.28 loss:0.087 grdn:0.230 lr:3.8e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:42:45 celerate.py:281 step:14K smpl:909K ep:6K epch:3.32 loss:0.086 grdn:0.229 lr:3.4e-06 updt_s: +0.507 data_s:0.008 +INFO 2025-09-08 15:44:29 celerate.py:281 step:14K smpl:922K ep:6K epch:3.37 loss:0.087 grdn:0.229 lr:3.0e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:46:12 celerate.py:281 step:15K smpl:934K ep:6K 
epch:3.42 loss:0.087 grdn:0.228 lr:2.8e-06 updt_s: +0.502 data_s:0.011 +INFO 2025-09-08 15:47:56 celerate.py:281 step:15K smpl:947K ep:6K epch:3.46 loss:0.086 grdn:0.232 lr:2.6e-06 updt_s: +0.515 data_s:0.004 +INFO 2025-09-08 15:49:39 celerate.py:281 step:15K smpl:960K ep:6K epch:3.51 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: +0.365 data_s:0.147 +INFO 2025-09-08 15:51:22 celerate.py:281 step:15K smpl:973K ep:6K epch:3.56 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: +0.333 data_s:0.179 +INFO 2025-09-08 15:53:07 celerate.py:281 step:15K smpl:986K ep:6K epch:3.60 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s: +0.357 data_s:0.164 +INFO 2025-09-08 15:54:50 celerate.py:281 step:16K smpl:998K ep:6K epch:3.65 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s: +0.365 data_s:0.151 +INFO 2025-09-08 15:56:35 celerate.py:281 step:16K smpl:1M ep:6K epch:3.70 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. +450 data_s:0.071 +INFO 2025-09-08 15:58:19 celerate.py:281 step:16K smpl:1M ep:6K epch:3.74 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0. +495 data_s:0.023 +INFO 2025-09-08 16:00:04 celerate.py:281 step:16K smpl:1M ep:6K epch:3.79 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0. +393 data_s:0.131 +INFO 2025-09-08 16:01:48 celerate.py:281 step:16K smpl:1M ep:6K epch:3.84 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +515 data_s:0.003 +INFO 2025-09-08 16:03:32 celerate.py:281 step:17K smpl:1M ep:7K epch:3.88 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +518 data_s:0.003 +INFO 2025-09-08 16:05:17 celerate.py:281 step:17K smpl:1M ep:7K epch:3.93 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +397 data_s:0.125 +INFO 2025-09-08 16:12:41 celerate.py:281 step:17K smpl:1M ep:7K epch:3.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:1. +496 data_s:0.719 +INFO 2025-09-08 16:14:28 celerate.py:281 step:17K smpl:1M ep:7K epch:4.03 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. +413 data_s:0.124 +INFO 2025-09-08 16:16:13 celerate.py:281 step:17K smpl:1M ep:7K epch:4.07 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. 
+469 data_s:0.050 +INFO 2025-09-08 16:17:57 celerate.py:281 step:18K smpl:1M ep:7K epch:4.12 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. +375 data_s:0.145 +INFO 2025-09-08 16:19:42 celerate.py:281 step:18K smpl:1M ep:7K epch:4.17 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +333 data_s:0.189 +INFO 2025-09-08 16:21:26 celerate.py:281 step:18K smpl:1M ep:7K epch:4.21 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +334 data_s:0.184 +INFO 2025-09-08 16:23:09 celerate.py:281 step:18K smpl:1M ep:7K epch:4.26 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0. +331 data_s:0.185 +INFO 2025-09-08 16:24:53 celerate.py:281 step:18K smpl:1M ep:7K epch:4.31 loss:0.088 grdn:0.236 lr:2.5e-06 updt_s:0. +333 data_s:0.182 +INFO 2025-09-08 16:26:38 celerate.py:281 step:19K smpl:1M ep:7K epch:4.35 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +337 data_s:0.188 +INFO 2025-09-08 16:28:22 celerate.py:281 step:19K smpl:1M ep:7K epch:4.40 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0. +420 data_s:0.099 +INFO 2025-09-08 16:30:06 celerate.py:281 step:19K smpl:1M ep:8K epch:4.45 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +444 data_s:0.075 +INFO 2025-09-08 16:31:49 celerate.py:281 step:19K smpl:1M ep:8K epch:4.49 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. +475 data_s:0.036 +INFO 2025-09-08 16:33:33 celerate.py:281 step:19K smpl:1M ep:8K epch:4.54 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. +379 data_s:0.139 +INFO 2025-09-08 16:35:17 celerate.py:281 step:20K smpl:1M ep:8K epch:4.59 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0. +348 data_s:0.171 +INFO 2025-09-08 16:37:01 celerate.py:281 step:20K smpl:1M ep:8K epch:4.63 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +332 data_s:0.185 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. 
+ warnings.warn( # warn only once +INFO 2025-09-08 16:38:46 celerate.py:281 step:20K smpl:1M ep:8K epch:4.68 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. +486 data_s:0.037 +INFO 2025-09-08 16:38:46 celerate.py:295 Checkpoint policy after step 20000 +INFO 2025-09-08 16:40:30 celerate.py:281 step:20K smpl:1M ep:8K epch:4.73 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +509 data_s:0.003 +INFO 2025-09-08 16:42:16 celerate.py:281 step:20K smpl:1M ep:8K epch:4.77 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0. +527 data_s:0.003 +INFO 2025-09-08 16:44:01 celerate.py:281 step:21K smpl:1M ep:8K epch:4.82 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0. +519 data_s:0.003 +INFO 2025-09-08 16:45:45 celerate.py:281 step:21K smpl:1M ep:8K epch:4.87 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. +504 data_s:0.013 +INFO 2025-09-08 16:47:29 celerate.py:281 step:21K smpl:1M ep:8K epch:4.91 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. +509 data_s:0.011 +INFO 2025-09-08 16:49:19 celerate.py:281 step:21K smpl:1M ep:8K epch:4.96 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0. +544 data_s:0.003 +INFO 2025-09-08 16:51:04 celerate.py:281 step:21K smpl:1M ep:8K epch:5.01 loss:0.086 grdn:0.225 lr:2.5e-06 updt_s:0. +488 data_s:0.039 +INFO 2025-09-08 16:52:51 celerate.py:281 step:22K smpl:1M ep:9K epch:5.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +430 data_s:0.099 +INFO 2025-09-08 16:54:36 celerate.py:281 step:22K smpl:1M ep:9K epch:5.10 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. +521 data_s:0.003 +INFO 2025-09-08 16:56:23 celerate.py:281 step:22K smpl:1M ep:9K epch:5.15 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +521 data_s:0.014 +INFO 2025-09-08 16:58:09 celerate.py:281 step:22K smpl:1M ep:9K epch:5.20 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. +525 data_s:0.003 +INFO 2025-09-08 17:00:04 celerate.py:281 step:22K smpl:1M ep:9K epch:5.24 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +568 data_s:0.003 +INFO 2025-09-08 17:02:00 celerate.py:281 step:23K smpl:1M ep:9K epch:5.29 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0. 
+575 data_s:0.003 +INFO 2025-09-08 17:03:49 celerate.py:281 step:23K smpl:1M ep:9K epch:5.34 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. +513 data_s:0.030 +INFO 2025-09-08 17:05:39 celerate.py:281 step:23K smpl:1M ep:9K epch:5.38 loss:0.085 grdn:0.227 lr:2.5e-06 updt_s:0. +523 data_s:0.027 +INFO 2025-09-08 17:07:26 celerate.py:281 step:23K smpl:1M ep:9K epch:5.43 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +529 data_s:0.003 +INFO 2025-09-08 17:09:12 celerate.py:281 step:23K smpl:1M ep:9K epch:5.48 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +526 data_s:0.003 +INFO 2025-09-08 17:10:55 celerate.py:281 step:24K smpl:2M ep:9K epch:5.52 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:0. +443 data_s:0.072 +INFO 2025-09-08 17:12:40 celerate.py:281 step:24K smpl:2M ep:9K epch:5.57 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0. +518 data_s:0.004 +INFO 2025-09-08 17:14:25 celerate.py:281 step:24K smpl:2M ep:10K epch:5.62 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 17:16:11 celerate.py:281 step:24K smpl:2M ep:10K epch:5.66 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0 +.523 data_s:0.003 +INFO 2025-09-08 17:17:55 celerate.py:281 step:24K smpl:2M ep:10K epch:5.71 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.515 data_s:0.005 +INFO 2025-09-08 17:19:39 celerate.py:281 step:25K smpl:2M ep:10K epch:5.76 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 +.415 data_s:0.106 +INFO 2025-09-08 17:21:24 celerate.py:281 step:25K smpl:2M ep:10K epch:5.80 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.507 data_s:0.016 +INFO 2025-09-08 17:23:08 celerate.py:281 step:25K smpl:2M ep:10K epch:5.85 loss:0.085 grdn:0.229 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 17:24:54 celerate.py:281 step:25K smpl:2M ep:10K epch:5.90 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0 +.518 data_s:0.008 +INFO 2025-09-08 17:26:41 celerate.py:281 step:25K smpl:2M ep:10K epch:5.94 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.529 data_s:0.003 +INFO 2025-09-08 17:28:24 celerate.py:281 step:26K smpl:2M ep:10K 
epch:5.99 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 17:30:11 celerate.py:281 step:26K smpl:2M ep:10K epch:6.04 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.370 data_s:0.164 +INFO 2025-09-08 17:31:55 celerate.py:281 step:26K smpl:2M ep:10K epch:6.08 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.385 data_s:0.132 +INFO 2025-09-08 17:33:39 celerate.py:281 step:26K smpl:2M ep:10K epch:6.13 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.450 data_s:0.069 +INFO 2025-09-08 17:35:24 celerate.py:281 step:26K smpl:2M ep:10K epch:6.18 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.468 data_s:0.052 +INFO 2025-09-08 17:37:07 celerate.py:281 step:27K smpl:2M ep:11K epch:6.23 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.004 +INFO 2025-09-08 17:38:52 celerate.py:281 step:27K smpl:2M ep:11K epch:6.27 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.519 data_s:0.003 +INFO 2025-09-08 17:40:40 celerate.py:281 step:27K smpl:2M ep:11K epch:6.32 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.534 data_s:0.003 +INFO 2025-09-08 17:42:57 celerate.py:281 step:27K smpl:2M ep:11K epch:6.37 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.678 data_s:0.007 +INFO 2025-09-08 17:46:13 celerate.py:281 step:27K smpl:2M ep:11K epch:6.41 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.968 data_s:0.009 +INFO 2025-09-08 17:49:20 celerate.py:281 step:28K smpl:2M ep:11K epch:6.46 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.895 data_s:0.037 +INFO 2025-09-08 17:51:22 celerate.py:281 step:28K smpl:2M ep:11K epch:6.51 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.604 data_s:0.003 +INFO 2025-09-08 17:53:07 celerate.py:281 step:28K smpl:2M ep:11K epch:6.55 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 17:54:51 celerate.py:281 step:28K smpl:2M ep:11K epch:6.60 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 17:56:36 celerate.py:281 step:28K smpl:2M ep:11K epch:6.65 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.519 data_s:0.003 +INFO 
2025-09-08 17:58:21 celerate.py:281 step:29K smpl:2M ep:11K epch:6.69 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 18:00:06 celerate.py:281 step:29K smpl:2M ep:11K epch:6.74 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.513 data_s:0.011 +INFO 2025-09-08 18:01:50 celerate.py:281 step:29K smpl:2M ep:11K epch:6.79 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.476 data_s:0.041 +INFO 2025-09-08 18:03:34 celerate.py:281 step:29K smpl:2M ep:12K epch:6.83 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.506 data_s:0.012 +INFO 2025-09-08 18:05:21 celerate.py:281 step:29K smpl:2M ep:12K epch:6.88 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 +.455 data_s:0.075 +INFO 2025-09-08 18:07:04 celerate.py:281 step:30K smpl:2M ep:12K epch:6.93 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 18:08:47 celerate.py:281 step:30K smpl:2M ep:12K epch:6.97 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.509 data_s:0.003 +INFO 2025-09-08 18:10:33 celerate.py:281 step:30K smpl:2M ep:12K epch:7.02 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.422 data_s:0.105 +INFO 2025-09-08 18:12:19 celerate.py:281 step:30K smpl:2M ep:12K epch:7.07 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.347 data_s:0.182 +INFO 2025-09-08 18:14:05 celerate.py:281 step:30K smpl:2M ep:12K epch:7.11 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.473 data_s:0.053 +INFO 2025-09-08 18:15:52 celerate.py:281 step:31K smpl:2M ep:12K epch:7.16 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 +.531 data_s:0.005 +INFO 2025-09-08 18:17:37 celerate.py:281 step:31K smpl:2M ep:12K epch:7.21 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.520 data_s:0.003 +INFO 2025-09-08 18:19:22 celerate.py:281 step:31K smpl:2M ep:12K epch:7.26 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.500 data_s:0.020 +INFO 2025-09-08 18:21:06 celerate.py:281 step:31K smpl:2M ep:12K epch:7.30 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 +.511 data_s:0.009 +INFO 2025-09-08 18:22:50 celerate.py:281 step:31K smpl:2M ep:12K epch:7.35 loss:0.087 
grdn:0.227 lr:2.5e-06 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 18:24:33 celerate.py:281 step:32K smpl:2M ep:13K epch:7.40 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.507 data_s:0.007 +INFO 2025-09-08 18:26:16 celerate.py:281 step:32K smpl:2M ep:13K epch:7.44 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.463 data_s:0.047 +INFO 2025-09-08 18:27:59 celerate.py:281 step:32K smpl:2M ep:13K epch:7.49 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:0 +.509 data_s:0.007 +INFO 2025-09-08 18:29:43 celerate.py:281 step:32K smpl:2M ep:13K epch:7.54 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 18:31:26 celerate.py:281 step:32K smpl:2M ep:13K epch:7.58 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 18:33:11 celerate.py:281 step:33K smpl:2M ep:13K epch:7.63 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 18:34:57 celerate.py:281 step:33K smpl:2M ep:13K epch:7.68 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.512 data_s:0.016 +INFO 2025-09-08 18:36:41 celerate.py:281 step:33K smpl:2M ep:13K epch:7.72 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 18:38:25 celerate.py:281 step:33K smpl:2M ep:13K epch:7.77 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 18:40:07 celerate.py:281 step:33K smpl:2M ep:13K epch:7.82 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.506 data_s:0.003 +INFO 2025-09-08 18:41:51 celerate.py:281 step:34K smpl:2M ep:13K epch:7.86 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.509 data_s:0.006 +INFO 2025-09-08 18:43:36 celerate.py:281 step:34K smpl:2M ep:13K epch:7.91 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 18:45:19 celerate.py:281 step:34K smpl:2M ep:13K epch:7.96 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 18:47:05 celerate.py:281 step:34K smpl:2M ep:14K epch:8.00 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.495 data_s:0.035 +INFO 2025-09-08 18:48:51 
celerate.py:281 step:34K smpl:2M ep:14K epch:8.05 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 +.413 data_s:0.112 +INFO 2025-09-08 18:50:34 celerate.py:281 step:35K smpl:2M ep:14K epch:8.10 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 18:52:19 celerate.py:281 step:35K smpl:2M ep:14K epch:8.14 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.520 data_s:0.003 +INFO 2025-09-08 18:54:03 celerate.py:281 step:35K smpl:2M ep:14K epch:8.19 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 18:55:48 celerate.py:281 step:35K smpl:2M ep:14K epch:8.24 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.420 data_s:0.101 +INFO 2025-09-08 18:57:33 celerate.py:281 step:35K smpl:2M ep:14K epch:8.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.506 data_s:0.022 +INFO 2025-09-08 18:59:19 celerate.py:281 step:36K smpl:2M ep:14K epch:8.33 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 19:01:03 celerate.py:281 step:36K smpl:2M ep:14K epch:8.38 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:02:48 celerate.py:281 step:36K smpl:2M ep:14K epch:8.43 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:04:32 celerate.py:281 step:36K smpl:2M ep:14K epch:8.47 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.505 data_s:0.013 +INFO 2025-09-08 19:06:15 celerate.py:281 step:36K smpl:2M ep:14K epch:8.52 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.384 data_s:0.130 +INFO 2025-09-08 19:07:58 celerate.py:281 step:37K smpl:2M ep:15K epch:8.57 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.430 data_s:0.084 +INFO 2025-09-08 19:09:41 celerate.py:281 step:37K smpl:2M ep:15K epch:8.61 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.351 data_s:0.162 +INFO 2025-09-08 19:11:25 celerate.py:281 step:37K smpl:2M ep:15K epch:8.66 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.337 data_s:0.181 +INFO 2025-09-08 19:13:08 celerate.py:281 step:37K smpl:2M ep:15K epch:8.71 loss:0.086 grdn:0.232 
lr:2.5e-06 updt_s:0 +.336 data_s:0.177 +INFO 2025-09-08 19:14:52 celerate.py:281 step:37K smpl:2M ep:15K epch:8.75 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.344 data_s:0.174 +INFO 2025-09-08 19:16:35 celerate.py:281 step:38K smpl:2M ep:15K epch:8.80 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.332 data_s:0.182 +INFO 2025-09-08 19:18:18 celerate.py:281 step:38K smpl:2M ep:15K epch:8.85 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.332 data_s:0.182 +INFO 2025-09-08 19:20:01 celerate.py:281 step:38K smpl:2M ep:15K epch:8.89 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.337 data_s:0.177 +INFO 2025-09-08 19:21:45 celerate.py:281 step:38K smpl:2M ep:15K epch:8.94 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.495 data_s:0.022 +INFO 2025-09-08 19:23:29 celerate.py:281 step:38K smpl:2M ep:15K epch:8.99 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:25:15 celerate.py:281 step:39K smpl:2M ep:15K epch:9.03 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.436 data_s:0.097 +INFO 2025-09-08 19:26:59 celerate.py:281 step:39K smpl:2M ep:15K epch:9.08 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.378 data_s:0.138 +INFO 2025-09-08 19:28:43 celerate.py:281 step:39K smpl:2M ep:15K epch:9.13 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:0 +.497 data_s:0.023 +INFO 2025-09-08 19:30:27 celerate.py:281 step:39K smpl:3M ep:16K epch:9.17 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:32:14 celerate.py:281 step:39K smpl:3M ep:16K epch:9.22 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.526 data_s:0.003 +INFO 2025-09-08 19:33:57 celerate.py:281 step:40K smpl:3M ep:16K epch:9.27 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.506 data_s:0.011 +INFO 2025-09-08 19:35:42 celerate.py:281 step:40K smpl:3M ep:16K epch:9.31 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 +.455 data_s:0.066 +INFO 2025-09-08 19:37:26 celerate.py:281 step:40K smpl:3M ep:16K epch:9.36 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 +.493 data_s:0.026 +INFO 2025-09-08 19:37:26 celerate.py:295 
Checkpoint policy after step 40000 +INFO 2025-09-08 19:39:10 celerate.py:281 step:40K smpl:3M ep:16K epch:9.41 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.397 data_s:0.114 +INFO 2025-09-08 19:40:53 celerate.py:281 step:40K smpl:3M ep:16K epch:9.45 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.344 data_s:0.168 +INFO 2025-09-08 19:42:37 celerate.py:281 step:41K smpl:3M ep:16K epch:9.50 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.480 data_s:0.036 +INFO 2025-09-08 19:44:21 celerate.py:281 step:41K smpl:3M ep:16K epch:9.55 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 19:46:05 celerate.py:281 step:41K smpl:3M ep:16K epch:9.60 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 19:47:49 celerate.py:281 step:41K smpl:3M ep:16K epch:9.64 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:49:33 celerate.py:281 step:41K smpl:3M ep:16K epch:9.69 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:51:17 celerate.py:281 step:42K smpl:3M ep:16K epch:9.74 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:53:00 celerate.py:281 step:42K smpl:3M ep:17K epch:9.78 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:54:44 celerate.py:281 step:42K smpl:3M ep:17K epch:9.83 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:56:28 celerate.py:281 step:42K smpl:3M ep:17K epch:9.88 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.512 data_s:0.003 +INFO 2025-09-08 19:58:11 celerate.py:281 step:42K smpl:3M ep:17K epch:9.92 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 19:59:55 celerate.py:281 step:43K smpl:3M ep:17K epch:9.97 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 20:01:42 celerate.py:281 step:43K smpl:3M ep:17K epch:10.02 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: +0.476 data_s:0.057 +INFO 2025-09-08 20:03:25 celerate.py:281 
step:43K smpl:3M ep:17K epch:10.06 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.471 data_s:0.043 +INFO 2025-09-08 20:05:09 celerate.py:281 step:43K smpl:3M ep:17K epch:10.11 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.515 data_s:0.004 +INFO 2025-09-08 20:06:53 celerate.py:281 step:43K smpl:3M ep:17K epch:10.16 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.013 +INFO 2025-09-08 20:08:36 celerate.py:281 step:44K smpl:3M ep:17K epch:10.20 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 20:10:20 celerate.py:281 step:44K smpl:3M ep:17K epch:10.25 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:12:04 celerate.py:281 step:44K smpl:3M ep:17K epch:10.30 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 20:13:47 celerate.py:281 step:44K smpl:3M ep:18K epch:10.34 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.503 data_s:0.011 +INFO 2025-09-08 20:15:31 celerate.py:281 step:44K smpl:3M ep:18K epch:10.39 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.416 data_s:0.102 +INFO 2025-09-08 20:17:15 celerate.py:281 step:45K smpl:3M ep:18K epch:10.44 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.502 data_s:0.017 +INFO 2025-09-08 20:18:58 celerate.py:281 step:45K smpl:3M ep:18K epch:10.48 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 20:20:41 celerate.py:281 step:45K smpl:3M ep:18K epch:10.53 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.496 data_s:0.017 +INFO 2025-09-08 20:22:24 celerate.py:281 step:45K smpl:3M ep:18K epch:10.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.493 data_s:0.022 +INFO 2025-09-08 20:24:08 celerate.py:281 step:45K smpl:3M ep:18K epch:10.63 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.485 data_s:0.031 +INFO 2025-09-08 20:25:52 celerate.py:281 step:46K smpl:3M ep:18K epch:10.67 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:27:36 celerate.py:281 step:46K smpl:3M ep:18K epch:10.72 loss:0.085 grdn:0.228 lr:2.5e-06 
updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 20:29:19 celerate.py:281 step:46K smpl:3M ep:18K epch:10.77 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 20:31:04 celerate.py:281 step:46K smpl:3M ep:18K epch:10.81 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:32:47 celerate.py:281 step:46K smpl:3M ep:18K epch:10.86 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 20:34:32 celerate.py:281 step:47K smpl:3M ep:18K epch:10.91 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:36:15 celerate.py:281 step:47K smpl:3M ep:19K epch:10.95 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 20:38:01 celerate.py:281 step:47K smpl:3M ep:19K epch:11.00 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.416 data_s:0.114 +INFO 2025-09-08 20:39:45 celerate.py:281 step:47K smpl:3M ep:19K epch:11.05 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.429 data_s:0.086 +INFO 2025-09-08 20:41:28 celerate.py:281 step:47K smpl:3M ep:19K epch:11.09 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: +0.409 data_s:0.106 +INFO 2025-09-08 20:43:12 celerate.py:281 step:48K smpl:3M ep:19K epch:11.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.470 data_s:0.050 +INFO 2025-09-08 20:44:56 celerate.py:281 step:48K smpl:3M ep:19K epch:11.19 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:46:41 celerate.py:281 step:48K smpl:3M ep:19K epch:11.23 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 20:48:25 celerate.py:281 step:48K smpl:3M ep:19K epch:11.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:50:08 celerate.py:281 step:48K smpl:3M ep:19K epch:11.33 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.505 data_s:0.009 +INFO 2025-09-08 20:51:52 celerate.py:281 step:49K smpl:3M ep:19K epch:11.37 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.449 data_s:0.067 +INFO 2025-09-08 20:53:35 
celerate.py:281 step:49K smpl:3M ep:19K epch:11.42 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.420 data_s:0.094 +INFO 2025-09-08 20:55:19 celerate.py:281 step:49K smpl:3M ep:19K epch:11.47 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 20:57:01 celerate.py:281 step:49K smpl:3M ep:19K epch:11.51 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.505 data_s:0.003 +INFO 2025-09-08 20:58:44 celerate.py:281 step:49K smpl:3M ep:20K epch:11.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 21:00:28 celerate.py:281 step:50K smpl:3M ep:20K epch:11.61 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 21:02:11 celerate.py:281 step:50K smpl:3M ep:20K epch:11.65 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.402 data_s:0.110 +INFO 2025-09-08 21:03:54 celerate.py:281 step:50K smpl:3M ep:20K epch:11.70 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.332 data_s:0.184 +INFO 2025-09-08 21:05:37 celerate.py:281 step:50K smpl:3M ep:20K epch:11.75 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: +0.332 data_s:0.182 +INFO 2025-09-08 21:07:21 celerate.py:281 step:50K smpl:3M ep:20K epch:11.80 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.466 data_s:0.049 +INFO 2025-09-08 21:09:05 celerate.py:281 step:51K smpl:3M ep:20K epch:11.84 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:10:49 celerate.py:281 step:51K smpl:3M ep:20K epch:11.89 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.512 data_s:0.004 +INFO 2025-09-08 21:12:32 celerate.py:281 step:51K smpl:3M ep:20K epch:11.94 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.484 data_s:0.032 +INFO 2025-09-08 21:14:17 celerate.py:281 step:51K smpl:3M ep:20K epch:11.98 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.517 data_s:0.004 +INFO 2025-09-08 21:16:03 celerate.py:281 step:51K smpl:3M ep:20K epch:12.03 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.424 data_s:0.105 +INFO 2025-09-08 21:17:46 celerate.py:281 step:52K smpl:3M ep:20K epch:12.08 loss:0.086 
grdn:0.236 lr:2.5e-06 updt_s: +0.442 data_s:0.073 +INFO 2025-09-08 21:19:30 celerate.py:281 step:52K smpl:3M ep:21K epch:12.12 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.511 data_s:0.007 +INFO 2025-09-08 21:21:15 celerate.py:281 step:52K smpl:3M ep:21K epch:12.17 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 21:22:59 celerate.py:281 step:52K smpl:3M ep:21K epch:12.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 21:24:43 celerate.py:281 step:52K smpl:3M ep:21K epch:12.26 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 21:26:27 celerate.py:281 step:53K smpl:3M ep:21K epch:12.31 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 21:28:11 celerate.py:281 step:53K smpl:3M ep:21K epch:12.36 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:29:55 celerate.py:281 step:53K smpl:3M ep:21K epch:12.40 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 21:31:39 celerate.py:281 step:53K smpl:3M ep:21K epch:12.45 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:33:23 celerate.py:281 step:53K smpl:3M ep:21K epch:12.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 21:35:08 celerate.py:281 step:54K smpl:3M ep:21K epch:12.54 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 21:36:51 celerate.py:281 step:54K smpl:3M ep:21K epch:12.59 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 21:38:36 celerate.py:281 step:54K smpl:3M ep:21K epch:12.64 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:40:18 celerate.py:281 step:54K smpl:3M ep:21K epch:12.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.473 data_s:0.038 +INFO 2025-09-08 21:42:02 celerate.py:281 step:54K smpl:3M ep:22K epch:12.73 loss:0.085 grdn:0.232 lr:2.5e-06 updt_s: +0.408 data_s:0.109 +INFO 2025-09-08 
21:43:45 celerate.py:281 step:55K smpl:3M ep:22K epch:12.78 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.377 data_s:0.136 +INFO 2025-09-08 21:45:29 celerate.py:281 step:55K smpl:4M ep:22K epch:12.83 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s: +0.497 data_s:0.022 +INFO 2025-09-08 21:47:12 celerate.py:281 step:55K smpl:4M ep:22K epch:12.87 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 21:48:56 celerate.py:281 step:55K smpl:4M ep:22K epch:12.92 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.429 data_s:0.086 +INFO 2025-09-08 21:50:39 celerate.py:281 step:55K smpl:4M ep:22K epch:12.97 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.454 data_s:0.059 +INFO 2025-09-08 21:52:25 celerate.py:281 step:56K smpl:4M ep:22K epch:13.01 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.459 data_s:0.072 +INFO 2025-09-08 21:54:08 celerate.py:281 step:56K smpl:4M ep:22K epch:13.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.382 data_s:0.132 +INFO 2025-09-08 21:55:51 celerate.py:281 step:56K smpl:4M ep:22K epch:13.11 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.500 data_s:0.016 +INFO 2025-09-08 21:57:36 celerate.py:281 step:56K smpl:4M ep:22K epch:13.15 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:59:20 celerate.py:281 step:56K smpl:4M ep:22K epch:13.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 22:01:03 celerate.py:281 step:57K smpl:4M ep:22K epch:13.25 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 22:02:48 celerate.py:281 step:57K smpl:4M ep:23K epch:13.29 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 22:04:32 celerate.py:281 step:57K smpl:4M ep:23K epch:13.34 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:06:14 celerate.py:281 step:57K smpl:4M ep:23K epch:13.39 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.505 data_s:0.005 +bINFO 2025-09-08 22:07:59 celerate.py:281 step:57K smpl:4M ep:23K epch:13.43 
loss:0.086 grdn:0.239 lr:2.5e-06 updt_s +:0.496 data_s:0.026 +INFO 2025-09-08 22:09:43 celerate.py:281 step:58K smpl:4M ep:23K epch:13.48 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.438 data_s:0.080 +INFO 2025-09-08 22:11:27 celerate.py:281 step:58K smpl:4M ep:23K epch:13.53 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.444 data_s:0.073 +INFO 2025-09-08 22:13:11 celerate.py:281 step:58K smpl:4M ep:23K epch:13.57 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:14:55 celerate.py:281 step:58K smpl:4M ep:23K epch:13.62 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 22:16:39 celerate.py:281 step:58K smpl:4M ep:23K epch:13.67 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:18:22 celerate.py:281 step:59K smpl:4M ep:23K epch:13.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:20:05 celerate.py:281 step:59K smpl:4M ep:23K epch:13.76 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-08 22:21:48 celerate.py:281 step:59K smpl:4M ep:23K epch:13.81 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s: +0.505 data_s:0.008 +INFO 2025-09-08 22:23:31 celerate.py:281 step:59K smpl:4M ep:23K epch:13.85 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.505 data_s:0.008 +INFO 2025-09-08 22:25:15 celerate.py:281 step:59K smpl:4M ep:24K epch:13.90 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:26:58 celerate.py:281 step:60K smpl:4M ep:24K epch:13.95 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.493 data_s:0.022 +INFO 2025-09-08 22:28:42 celerate.py:281 step:60K smpl:4M ep:24K epch:14.00 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:30:29 celerate.py:281 step:60K smpl:4M ep:24K epch:14.04 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.435 data_s:0.101 +INFO 2025-09-08 22:30:29 celerate.py:295 Checkpoint policy after step 60000 +INFO 2025-09-08 22:32:14 celerate.py:281 step:60K smpl:4M ep:24K 
epch:14.09 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-08 22:33:58 celerate.py:281 step:60K smpl:4M ep:24K epch:14.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 22:35:42 celerate.py:281 step:61K smpl:4M ep:24K epch:14.18 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:37:25 celerate.py:281 step:61K smpl:4M ep:24K epch:14.23 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:39:09 celerate.py:281 step:61K smpl:4M ep:24K epch:14.28 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 22:40:52 celerate.py:281 step:61K smpl:4M ep:24K epch:14.32 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 22:42:35 celerate.py:281 step:61K smpl:4M ep:24K epch:14.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:44:18 celerate.py:281 step:62K smpl:4M ep:24K epch:14.42 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 22:46:02 celerate.py:281 step:62K smpl:4M ep:24K epch:14.46 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:47:47 celerate.py:281 step:62K smpl:4M ep:25K epch:14.51 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 22:49:30 celerate.py:281 step:62K smpl:4M ep:25K epch:14.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 22:51:14 celerate.py:281 step:62K smpl:4M ep:25K epch:14.60 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 22:52:58 celerate.py:281 step:63K smpl:4M ep:25K epch:14.65 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.484 data_s:0.033 +INFO 2025-09-08 22:54:41 celerate.py:281 step:63K smpl:4M ep:25K epch:14.70 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.501 data_s:0.016 +INFO 2025-09-08 22:56:25 celerate.py:281 step:63K smpl:4M ep:25K epch:14.74 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.436 
data_s:0.081 +INFO 2025-09-08 22:58:08 celerate.py:281 step:63K smpl:4M ep:25K epch:14.79 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.436 data_s:0.080 +INFO 2025-09-08 22:59:51 celerate.py:281 step:63K smpl:4M ep:25K epch:14.84 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.344 data_s:0.168 +INFO 2025-09-08 23:01:34 celerate.py:281 step:64K smpl:4M ep:25K epch:14.88 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.479 data_s:0.035 +INFO 2025-09-08 23:03:18 celerate.py:281 step:64K smpl:4M ep:25K epch:14.93 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 23:05:02 celerate.py:281 step:64K smpl:4M ep:25K epch:14.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 23:06:47 celerate.py:281 step:64K smpl:4M ep:25K epch:15.02 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.405 data_s:0.119 +INFO 2025-09-08 23:08:30 celerate.py:281 step:64K smpl:4M ep:26K epch:15.07 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: +0.393 data_s:0.121 +INFO 2025-09-08 23:10:13 celerate.py:281 step:65K smpl:4M ep:26K epch:15.12 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.369 data_s:0.142 +INFO 2025-09-08 23:11:56 celerate.py:281 step:65K smpl:4M ep:26K epch:15.17 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: +0.360 data_s:0.156 +INFO 2025-09-08 23:13:40 celerate.py:281 step:65K smpl:4M ep:26K epch:15.21 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.333 data_s:0.182 +INFO 2025-09-08 23:15:23 celerate.py:281 step:65K smpl:4M ep:26K epch:15.26 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.376 data_s:0.136 +INFO 2025-09-08 23:17:05 celerate.py:281 step:65K smpl:4M ep:26K epch:15.31 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.439 data_s:0.069 +INFO 2025-09-08 23:18:49 celerate.py:281 step:66K smpl:4M ep:26K epch:15.35 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.512 data_s:0.006 +INFO 2025-09-08 23:20:32 celerate.py:281 step:66K smpl:4M ep:26K epch:15.40 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 23:22:15 celerate.py:281 step:66K 
smpl:4M ep:26K epch:15.45 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.508 data_s:0.004 +INFO 2025-09-08 23:23:59 celerate.py:281 step:66K smpl:4M ep:26K epch:15.49 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.474 data_s:0.046 +INFO 2025-09-08 23:25:42 celerate.py:281 step:66K smpl:4M ep:26K epch:15.54 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 23:27:25 celerate.py:281 step:67K smpl:4M ep:26K epch:15.59 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.487 data_s:0.025 +INFO 2025-09-08 23:29:08 celerate.py:281 step:67K smpl:4M ep:26K epch:15.63 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.005 +INFO 2025-09-08 23:30:50 celerate.py:281 step:67K smpl:4M ep:27K epch:15.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.506 data_s:0.003 +INFO 2025-09-08 23:32:33 celerate.py:281 step:67K smpl:4M ep:27K epch:15.73 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 23:34:16 celerate.py:281 step:67K smpl:4M ep:27K epch:15.77 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.503 data_s:0.014 +INFO 2025-09-08 23:35:59 celerate.py:281 step:68K smpl:4M ep:27K epch:15.82 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 23:37:42 celerate.py:281 step:68K smpl:4M ep:27K epch:15.87 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 23:39:26 celerate.py:281 step:68K smpl:4M ep:27K epch:15.91 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 23:41:10 celerate.py:281 step:68K smpl:4M ep:27K epch:15.96 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 23:42:56 celerate.py:281 step:68K smpl:4M ep:27K epch:16.01 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.469 data_s:0.061 +INFO 2025-09-08 23:44:40 celerate.py:281 step:69K smpl:4M ep:27K epch:16.05 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.339 data_s:0.179 +INFO 2025-09-08 23:46:22 celerate.py:281 step:69K smpl:4M ep:27K epch:16.10 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: 
+0.386 data_s:0.124 +INFO 2025-09-08 23:48:06 celerate.py:281 step:69K smpl:4M ep:27K epch:16.15 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 23:49:50 celerate.py:281 step:69K smpl:4M ep:27K epch:16.20 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 23:51:33 celerate.py:281 step:69K smpl:4M ep:27K epch:16.24 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.501 data_s:0.014 +INFO 2025-09-08 23:53:17 celerate.py:281 step:70K smpl:4M ep:28K epch:16.29 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 23:54:59 celerate.py:281 step:70K smpl:4M ep:28K epch:16.34 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.507 data_s:0.005 +INFO 2025-09-08 23:56:43 celerate.py:281 step:70K smpl:4M ep:28K epch:16.38 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 23:58:27 celerate.py:281 step:70K smpl:4M ep:28K epch:16.43 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 00:00:12 celerate.py:281 step:70K smpl:5M ep:28K epch:16.48 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-09 00:01:55 celerate.py:281 step:71K smpl:5M ep:28K epch:16.52 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:03:37 celerate.py:281 step:71K smpl:5M ep:28K epch:16.57 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 00:05:20 celerate.py:281 step:71K smpl:5M ep:28K epch:16.62 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:07:04 celerate.py:281 step:71K smpl:5M ep:28K epch:16.66 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-09 00:08:47 celerate.py:281 step:71K smpl:5M ep:28K epch:16.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:10:30 celerate.py:281 step:72K smpl:5M ep:28K epch:16.76 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 00:12:13 celerate.py:281 
step:72K smpl:5M ep:28K epch:16.80 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:13:58 celerate.py:281 step:72K smpl:5M ep:29K epch:16.85 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.522 data_s:0.003 +INFO 2025-09-09 00:15:40 celerate.py:281 step:72K smpl:5M ep:29K epch:16.90 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.506 data_s:0.003 +INFO 2025-09-09 00:17:23 celerate.py:281 step:72K smpl:5M ep:29K epch:16.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.510 data_s:0.004 +INFO 2025-09-09 00:19:05 celerate.py:281 step:73K smpl:5M ep:29K epch:16.99 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.003 +INFO 2025-09-09 00:20:51 celerate.py:281 step:73K smpl:5M ep:29K epch:17.04 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.437 data_s:0.092 +INFO 2025-09-09 00:22:34 celerate.py:281 step:73K smpl:5M ep:29K epch:17.08 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.489 data_s:0.025 +INFO 2025-09-09 00:24:17 celerate.py:281 step:73K smpl:5M ep:29K epch:17.13 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 00:26:01 celerate.py:281 step:73K smpl:5M ep:29K epch:17.18 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 00:27:45 celerate.py:281 step:74K smpl:5M ep:29K epch:17.22 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 00:29:28 celerate.py:281 step:74K smpl:5M ep:29K epch:17.27 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:31:11 celerate.py:281 step:74K smpl:5M ep:29K epch:17.32 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:32:54 celerate.py:281 step:74K smpl:5M ep:29K epch:17.37 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:34:38 celerate.py:281 step:74K smpl:5M ep:29K epch:17.41 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 00:36:21 celerate.py:281 step:75K smpl:5M ep:30K epch:17.46 loss:0.086 grdn:0.241 lr:2.5e-06 
updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 00:38:05 celerate.py:281 step:75K smpl:5M ep:30K epch:17.51 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:39:49 celerate.py:281 step:75K smpl:5M ep:30K epch:17.55 loss:0.086 grdn:0.251 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 00:41:32 celerate.py:281 step:75K smpl:5M ep:30K epch:17.60 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.506 data_s:0.005 +INFO 2025-09-09 00:43:14 celerate.py:281 step:75K smpl:5M ep:30K epch:17.65 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.390 data_s:0.122 +INFO 2025-09-09 00:44:58 celerate.py:281 step:76K smpl:5M ep:30K epch:17.69 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.410 data_s:0.107 +INFO 2025-09-09 00:46:41 celerate.py:281 step:76K smpl:5M ep:30K epch:17.74 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.427 data_s:0.085 +INFO 2025-09-09 00:48:24 celerate.py:281 step:76K smpl:5M ep:30K epch:17.79 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: +0.492 data_s:0.025 +INFO 2025-09-09 00:50:08 celerate.py:281 step:76K smpl:5M ep:30K epch:17.83 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 00:51:51 celerate.py:281 step:76K smpl:5M ep:30K epch:17.88 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:53:33 celerate.py:281 step:77K smpl:5M ep:30K epch:17.93 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:55:18 celerate.py:281 step:77K smpl:5M ep:30K epch:17.97 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-09 00:57:05 celerate.py:281 step:77K smpl:5M ep:31K epch:18.02 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.486 data_s:0.046 +INFO 2025-09-09 00:58:47 celerate.py:281 step:77K smpl:5M ep:31K epch:18.07 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 01:00:31 celerate.py:281 step:77K smpl:5M ep:31K epch:18.11 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:02:15 
celerate.py:281 step:78K smpl:5M ep:31K epch:18.16 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:03:56 celerate.py:281 step:78K smpl:5M ep:31K epch:18.21 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.498 data_s:0.008 +INFO 2025-09-09 01:05:40 celerate.py:281 step:78K smpl:5M ep:31K epch:18.25 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.508 data_s:0.010 +INFO 2025-09-09 01:07:24 celerate.py:281 step:78K smpl:5M ep:31K epch:18.30 loss:0.085 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:09:07 celerate.py:281 step:78K smpl:5M ep:31K epch:18.35 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:10:51 celerate.py:281 step:79K smpl:5M ep:31K epch:18.40 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:12:35 celerate.py:281 step:79K smpl:5M ep:31K epch:18.44 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:14:17 celerate.py:281 step:79K smpl:5M ep:31K epch:18.49 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 01:16:01 celerate.py:281 step:79K smpl:5M ep:31K epch:18.54 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:17:43 celerate.py:281 step:79K smpl:5M ep:31K epch:18.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.507 data_s:0.003 +INFO 2025-09-09 01:19:26 celerate.py:281 step:80K smpl:5M ep:32K epch:18.63 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:21:09 celerate.py:281 step:80K smpl:5M ep:32K epch:18.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:22:52 celerate.py:281 step:80K smpl:5M ep:32K epch:18.72 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.501 data_s:0.011 +INFO 2025-09-09 01:22:52 celerate.py:295 Checkpoint policy after step 80000 +INFO 2025-09-09 01:24:37 celerate.py:281 step:80K smpl:5M ep:32K epch:18.77 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 
01:26:20 celerate.py:281 step:80K smpl:5M ep:32K epch:18.82 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:28:05 celerate.py:281 step:81K smpl:5M ep:32K epch:18.86 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 01:29:48 celerate.py:281 step:81K smpl:5M ep:32K epch:18.91 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:31:32 celerate.py:281 step:81K smpl:5M ep:32K epch:18.96 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 01:33:18 celerate.py:281 step:81K smpl:5M ep:32K epch:19.00 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.487 data_s:0.042 +INFO 2025-09-09 01:35:01 celerate.py:281 step:81K smpl:5M ep:32K epch:19.05 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.461 data_s:0.054 +INFO 2025-09-09 01:36:46 celerate.py:281 step:82K smpl:5M ep:32K epch:19.10 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.450 data_s:0.071 +INFO 2025-09-09 01:38:29 celerate.py:281 step:82K smpl:5M ep:32K epch:19.14 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.451 data_s:0.064 +INFO 2025-09-09 01:40:12 celerate.py:281 step:82K smpl:5M ep:32K epch:19.19 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 01:41:54 celerate.py:281 step:82K smpl:5M ep:33K epch:19.24 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: +0.480 data_s:0.030 +INFO 2025-09-09 01:43:37 celerate.py:281 step:82K smpl:5M ep:33K epch:19.28 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.434 data_s:0.080 +INFO 2025-09-09 01:45:20 celerate.py:281 step:83K smpl:5M ep:33K epch:19.33 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:47:04 celerate.py:281 step:83K smpl:5M ep:33K epch:19.38 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:48:49 celerate.py:281 step:83K smpl:5M ep:33K epch:19.42 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.520 data_s:0.003 +INFO 2025-09-09 01:50:32 celerate.py:281 step:83K smpl:5M ep:33K epch:19.47 
loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:52:15 celerate.py:281 step:83K smpl:5M ep:33K epch:19.52 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:53:59 celerate.py:281 step:84K smpl:5M ep:33K epch:19.57 loss:0.085 grdn:0.236 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:55:41 celerate.py:281 step:84K smpl:5M ep:33K epch:19.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 01:57:24 celerate.py:281 step:84K smpl:5M ep:33K epch:19.66 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.500 data_s:0.013 +INFO 2025-09-09 01:59:07 celerate.py:281 step:84K smpl:5M ep:33K epch:19.71 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 02:00:50 celerate.py:281 step:84K smpl:5M ep:33K epch:19.75 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 02:02:34 celerate.py:281 step:85K smpl:5M ep:34K epch:19.80 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-09 02:04:17 celerate.py:281 step:85K smpl:5M ep:34K epch:19.85 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 02:05:59 celerate.py:281 step:85K smpl:5M ep:34K epch:19.89 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.507 data_s:0.004 +INFO 2025-09-09 02:07:43 celerate.py:281 step:85K smpl:5M ep:34K epch:19.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.011 +INFO 2025-09-09 02:09:26 celerate.py:281 step:85K smpl:5M ep:34K epch:19.99 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.392 data_s:0.124 +INFO 2025-09-09 02:11:14 celerate.py:281 step:86K smpl:5M ep:34K epch:20.03 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.435 data_s:0.100 +INFO 2025-09-09 02:12:57 celerate.py:281 step:86K smpl:5M ep:34K epch:20.08 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 02:14:41 celerate.py:281 step:86K smpl:6M ep:34K epch:20.13 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 
2025-09-09 02:16:25 celerate.py:281 step:86K smpl:6M ep:34K epch:20.17 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 02:18:09 celerate.py:281 step:86K smpl:6M ep:34K epch:20.22 loss:0.085 grdn:0.238 lr:2.5e-06 updt_s: +0.506 data_s:0.010 +INFO 2025-09-09 02:19:53 celerate.py:281 step:87K smpl:6M ep:34K epch:20.27 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 02:21:35 celerate.py:281 step:87K smpl:6M ep:34K epch:20.31 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.501 data_s:0.007 +INFO 2025-09-09 02:23:17 celerate.py:281 step:87K smpl:6M ep:34K epch:20.36 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: +0.432 data_s:0.080 +INFO 2025-09-09 02:25:01 celerate.py:281 step:87K smpl:6M ep:35K epch:20.41 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.445 data_s:0.073 +INFO 2025-09-09 02:26:43 celerate.py:281 step:87K smpl:6M ep:35K epch:20.45 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.401 data_s:0.107 +INFO 2025-09-09 02:28:26 celerate.py:281 step:88K smpl:6M ep:35K epch:20.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.502 data_s:0.009 +INFO 2025-09-09 02:30:09 celerate.py:281 step:88K smpl:6M ep:35K epch:20.55 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: +0.491 data_s:0.021 +INFO 2025-09-09 02:31:52 celerate.py:281 step:88K smpl:6M ep:35K epch:20.59 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.391 data_s:0.124 +INFO 2025-09-09 02:33:35 celerate.py:281 step:88K smpl:6M ep:35K epch:20.64 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.332 data_s:0.180 +INFO 2025-09-09 02:35:18 celerate.py:281 step:88K smpl:6M ep:35K epch:20.69 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: +0.333 data_s:0.181 +INFO 2025-09-09 02:37:02 celerate.py:281 step:89K smpl:6M ep:35K epch:20.74 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: +0.332 data_s:0.185 +INFO 2025-09-09 02:38:47 celerate.py:281 step:89K smpl:6M ep:35K epch:20.78 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.332 data_s:0.190 +INFO 2025-09-09 02:40:30 celerate.py:281 step:89K smpl:6M ep:35K 
epch:20.83 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.330 data_s:0.187 +INFO 2025-09-09 02:42:12 celerate.py:281 step:89K smpl:6M ep:35K epch:20.88 loss:0.085 grdn:0.243 lr:2.5e-06 updt_s: +0.331 data_s:0.177 +INFO 2025-09-09 02:43:56 celerate.py:281 step:89K smpl:6M ep:35K epch:20.92 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.330 data_s:0.185 +INFO 2025-09-09 02:45:39 celerate.py:281 step:90K smpl:6M ep:36K epch:20.97 loss:0.087 grdn:0.252 lr:2.5e-06 updt_s: +0.335 data_s:0.182 +INFO 2025-09-09 02:47:25 celerate.py:281 step:90K smpl:6M ep:36K epch:21.02 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.330 data_s:0.197 +INFO 2025-09-09 02:49:08 celerate.py:281 step:90K smpl:6M ep:36K epch:21.06 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.330 data_s:0.182 +INFO 2025-09-09 02:50:51 celerate.py:281 step:90K smpl:6M ep:36K epch:21.11 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.334 data_s:0.181 +INFO 2025-09-09 02:52:34 celerate.py:281 step:90K smpl:6M ep:36K epch:21.16 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.330 data_s:0.183 +INFO 2025-09-09 02:54:17 celerate.py:281 step:91K smpl:6M ep:36K epch:21.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.342 data_s:0.173 +INFO 2025-09-09 02:56:01 celerate.py:281 step:91K smpl:6M ep:36K epch:21.25 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.331 data_s:0.189 +INFO 2025-09-09 02:57:44 celerate.py:281 step:91K smpl:6M ep:36K epch:21.30 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.329 data_s:0.182 +INFO 2025-09-09 02:59:27 celerate.py:281 step:91K smpl:6M ep:36K epch:21.34 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.341 data_s:0.175 +INFO 2025-09-09 03:01:11 celerate.py:281 step:91K smpl:6M ep:36K epch:21.39 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.330 data_s:0.188 +INFO 2025-09-09 03:02:54 celerate.py:281 step:92K smpl:6M ep:36K epch:21.44 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.328 data_s:0.186 +INFO 2025-09-09 03:04:39 celerate.py:281 step:92K smpl:6M ep:36K epch:21.48 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.329 
data_s:0.195 +INFO 2025-09-09 03:06:22 celerate.py:281 step:92K smpl:6M ep:36K epch:21.53 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.330 data_s:0.183 +INFO 2025-09-09 03:08:04 celerate.py:281 step:92K smpl:6M ep:37K epch:21.58 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.462 data_s:0.051 +INFO 2025-09-09 03:09:48 celerate.py:281 step:92K smpl:6M ep:37K epch:21.62 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: +0.407 data_s:0.108 +INFO 2025-09-09 03:11:32 celerate.py:281 step:93K smpl:6M ep:37K epch:21.67 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.333 data_s:0.185 +INFO 2025-09-09 03:13:15 celerate.py:281 step:93K smpl:6M ep:37K epch:21.72 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 03:14:58 celerate.py:281 step:93K smpl:6M ep:37K epch:21.77 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.357 data_s:0.156 +INFO 2025-09-09 03:16:41 celerate.py:281 step:93K smpl:6M ep:37K epch:21.81 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.487 data_s:0.027 +INFO 2025-09-09 03:18:25 celerate.py:281 step:93K smpl:6M ep:37K epch:21.86 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 03:20:08 celerate.py:281 step:94K smpl:6M ep:37K epch:21.91 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 03:21:51 celerate.py:281 step:94K smpl:6M ep:37K epch:21.95 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.508 data_s:0.004 +INFO 2025-09-09 03:23:38 celerate.py:281 step:94K smpl:6M ep:37K epch:22.00 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.429 data_s:0.104 +INFO 2025-09-09 03:25:20 celerate.py:281 step:94K smpl:6M ep:37K epch:22.05 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.328 data_s:0.183 +INFO 2025-09-09 03:27:04 celerate.py:281 step:94K smpl:6M ep:37K epch:22.09 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.329 data_s:0.191 +INFO 2025-09-09 03:28:47 celerate.py:281 step:95K smpl:6M ep:37K epch:22.14 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.329 data_s:0.186 +INFO 2025-09-09 03:30:30 celerate.py:281 step:95K 
smpl:6M ep:38K epch:22.19 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.346 data_s:0.166 +INFO 2025-09-09 03:32:13 celerate.py:281 step:95K smpl:6M ep:38K epch:22.23 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.334 data_s:0.181 +INFO 2025-09-09 03:33:56 celerate.py:281 step:95K smpl:6M ep:38K epch:22.28 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.402 data_s:0.110 +INFO 2025-09-09 03:35:39 celerate.py:281 step:95K smpl:6M ep:38K epch:22.33 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: +0.353 data_s:0.161 +INFO 2025-09-09 03:37:22 celerate.py:281 step:96K smpl:6M ep:38K epch:22.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.356 data_s:0.158 +INFO 2025-09-09 03:39:04 celerate.py:281 step:96K smpl:6M ep:38K epch:22.42 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.379 data_s:0.131 +INFO 2025-09-09 03:40:49 celerate.py:281 step:96K smpl:6M ep:38K epch:22.47 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.344 data_s:0.175 +INFO 2025-09-09 03:42:32 celerate.py:281 step:96K smpl:6M ep:38K epch:22.51 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.331 data_s:0.185 +INFO 2025-09-09 03:44:15 celerate.py:281 step:96K smpl:6M ep:38K epch:22.56 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.331 data_s:0.183 +INFO 2025-09-09 03:45:58 celerate.py:281 step:97K smpl:6M ep:38K epch:22.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.330 data_s:0.184 +INFO 2025-09-09 03:47:43 celerate.py:281 step:97K smpl:6M ep:38K epch:22.65 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: +0.331 data_s:0.188 +INFO 2025-09-09 03:49:27 celerate.py:281 step:97K smpl:6M ep:38K epch:22.70 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.333 data_s:0.185 +INFO 2025-09-09 03:51:10 celerate.py:281 step:97K smpl:6M ep:39K epch:22.75 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: +0.330 data_s:0.185 +INFO 2025-09-09 03:52:54 celerate.py:281 step:97K smpl:6M ep:39K epch:22.79 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.330 data_s:0.192 +INFO 2025-09-09 03:54:37 celerate.py:281 step:98K smpl:6M ep:39K epch:22.84 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: 
+0.329 data_s:0.185 +INFO 2025-09-09 03:56:21 celerate.py:281 step:98K smpl:6M ep:39K epch:22.89 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 03:58:04 celerate.py:281 step:98K smpl:6M ep:39K epch:22.94 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.329 data_s:0.185 +INFO 2025-09-09 03:59:46 celerate.py:281 step:98K smpl:6M ep:39K epch:22.98 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.329 data_s:0.183 +INFO 2025-09-09 04:01:32 celerate.py:281 step:98K smpl:6M ep:39K epch:23.03 loss:0.087 grdn:0.250 lr:2.5e-06 updt_s: +0.376 data_s:0.151 +INFO 2025-09-09 04:03:16 celerate.py:281 step:99K smpl:6M ep:39K epch:23.08 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 04:04:59 celerate.py:281 step:99K smpl:6M ep:39K epch:23.12 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.379 data_s:0.136 +INFO 2025-09-09 04:06:42 celerate.py:281 step:99K smpl:6M ep:39K epch:23.17 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 04:08:25 celerate.py:281 step:99K smpl:6M ep:39K epch:23.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 04:10:07 celerate.py:281 step:99K smpl:6M ep:39K epch:23.26 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.470 data_s:0.039 +INFO 2025-09-09 04:11:50 celerate.py:281 step:100K smpl:6M ep:39K epch:23.31 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s +:0.347 data_s:0.169 +INFO 2025-09-09 04:13:34 celerate.py:281 step:100K smpl:6M ep:40K epch:23.36 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s +:0.330 data_s:0.185 +INFO 2025-09-09 04:15:17 celerate.py:281 step:100K smpl:6M ep:40K epch:23.40 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s +:0.356 data_s:0.156 +INFO 2025-09-09 04:15:17 celerate.py:295 Checkpoint policy after step 100000 +INFO 2025-09-09 04:15:18 celerate.py:359 End of training +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ tmux capture-pane -pS - > tmux_log.txt + + + + + + + + + +