Support accelerate training and add test configs for SmolVLA
- 2-GPU SLURM job (distributed training)
- 1-GPU local accelerate and direct training scripts
- Accelerate configs for 1-GPU and 2-GPU setups (sketched below)
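
The accelerate config files themselves are not part of this hunk. As a minimal sketch, a single-node 2-GPU DDP config like accelerate_configs/2gpu_config_safe.yaml could be created as follows; the values are assumptions based on standard defaults produced by the accelerate config wizard, not taken from this commit:

mkdir -p accelerate_configs
cat > accelerate_configs/2gpu_config_safe.yaml <<'EOF'
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
use_cpu: false
EOF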
@@ -0,0 +1,67 @@
#!/bin/bash
#SBATCH --job-name=test_accelerate
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:2
#SBATCH --time=1:00:00
#SBATCH --partition=hopper-prod
#SBATCH --output=/fsx/dana_aubakirova/vla/logs/test_accelerate_%j.out
#SBATCH --error=/fsx/dana_aubakirova/vla/logs/test_accelerate_%j.err
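
# Note: SLURM writes the #SBATCH log files above to /fsx/dana_aubakirova/vla/logs,
# which must already exist at submission time; SLURM does not create it, and the
# mkdir below runs only after the job starts and targets a different directory.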

# Create logs directory if it doesn't exist
mkdir -p /fsx/dana_aubakirova/vla/pr/lerobot/logs

# Activate conda environment
source /fsx/dana_aubakirova/miniconda3/etc/profile.d/conda.sh
conda activate multi

# CUDA and distributed environment for the 2-GPU test
export CUDA_VISIBLE_DEVICES=0,1
# PYTORCH_CUDA_ALLOC_CONF must be set in a single export; a later re-export
# would silently replace the whole value.
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True
export TORCH_DISTRIBUTED_DEBUG=OFF
export NCCL_DEBUG=INFO
export CUDA_LAUNCH_BLOCKING=0
export ACCELERATE_USE_FSDP=false
export ACCELERATE_USE_DEEPSPEED=false
export HF_ACCELERATE_DEVICE_MAP=false
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1
export SAFETENSORS_FAST_GPU=1
export HF_HUB_ENABLE_HF_TRANSFER=1
export ACCELERATE_TORCH_DEVICE_MAP_AUTO=false
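
# Optional sanity check (editor's addition, not in the original script): confirm
# both GPUs are visible to PyTorch before launching the distributed run.
python -c "import torch; print('CUDA devices visible:', torch.cuda.device_count())"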

# Change to working directory
cd /fsx/dana_aubakirova/vla/pr/lerobot
echo "=== Testing Accelerate Multi-GPU Training with SmolVLA ==="
|
||||
echo "Dataset: lerobot/svla_so100_sorting"
|
||||
echo "GPUs: 2"
|
||||
echo "Steps: 100 (for quick test)"
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo ""
|
||||
|
||||

# Set output directory with job ID
export OUTPUT_DIR="outputs/test_accelerate_2gpu_job_${SLURM_JOB_ID}"

echo "Output directory: $OUTPUT_DIR"
echo ""

# Test accelerate training
accelerate launch --config_file accelerate_configs/2gpu_config_safe.yaml -m lerobot.scripts.train \
    --policy.type=smolvla \
    --policy.push_to_hub=false \
    --dataset.repo_id=lerobot/svla_so100_sorting \
    --dataset.video_backend=pyav \
    --steps=100 \
    --save_freq=50 \
    --log_freq=5 \
    --batch_size=2 \
    --num_workers=0 \
    --output_dir="$OUTPUT_DIR" \
    --wandb.enable=false
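
# The commit also adds 1-GPU accelerate and direct training scripts; rough
# single-GPU equivalents of this test would be (the 1-GPU config filename is
# an assumption, not shown in this diff; remaining flags mirror the command
# above):
#   accelerate launch --config_file accelerate_configs/1gpu_config.yaml \
#       -m lerobot.scripts.train --policy.type=smolvla --steps=100 ...
#   python -m lerobot.scripts.train --policy.type=smolvla --steps=100 ...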

echo ""
echo "=== Training completed! ==="
echo "Check logs and outputs in: $OUTPUT_DIR"
echo "Job ID: $SLURM_JOB_ID"
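
To run the test, submit the script to SLURM and follow the job log. The script's filename is not shown in this diff, so the one below is hypothetical:

sbatch test_accelerate_2gpu.sh
tail -f /fsx/dana_aubakirova/vla/logs/test_accelerate_<job_id>.out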