Files
lerobot/test_direct_1gpu_local.sh
T
danaaubakirova d148279921 Support accelerate training and add test configs for SmolVLA
- 2-GPU SLURM job (distributed training)
- 1-GPU local accelerate and direct training scripts
- Accelerate configs for 1-GPU and 2-GPU setups
2025-09-04 13:07:25 +00:00

46 lines
1.2 KiB
Bash
Executable File

#!/bin/bash
# Quick local smoke test: direct 1-GPU SmolVLA training (no accelerate).
# Runs 50 steps of lerobot.scripts.train on lerobot/svla_so100_sorting.
# Requires: conda env "multi" and a repo checkout at /fsx/dana_aubakirova/vla/pr/lerobot.
#
# Strict mode; -u is enabled only after conda activation because some conda
# hook scripts reference unset variables and would abort under `set -u`.
set -eo pipefail

echo "=== Direct 1-GPU Training Test with SmolVLA (no accelerate) ==="
echo "Environment: multi"
echo "GPU: 1"
echo "Steps: 50 (quick local test)"
echo ""

# Activate conda environment — fail fast with a clear message if the install moved.
readonly CONDA_SH=/fsx/dana_aubakirova/miniconda3/etc/profile.d/conda.sh
if [[ ! -f "$CONDA_SH" ]]; then
  echo "error: conda hook not found at $CONDA_SH" >&2
  exit 1
fi
# shellcheck disable=SC1090
source "$CONDA_SH"
conda activate multi
set -u

# Set CUDA environment for 1 GPU
export CUDA_VISIBLE_DEVICES=0
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True
export TORCH_DISTRIBUTED_DEBUG=OFF
export CUDA_LAUNCH_BLOCKING=0
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1

# Change to the repo checkout; abort rather than train from the wrong tree.
cd /fsx/dana_aubakirova/vla/pr/lerobot || {
  echo "error: repo directory missing" >&2
  exit 1
}

# Set output directory with timestamp so repeated runs never collide.
export OUTPUT_DIR="outputs/test_direct_1gpu_local_$(date +%Y%m%d_%H%M%S)"
echo "Output directory: $OUTPUT_DIR"
echo ""

# Test direct training with 1 GPU (no accelerate). batch_size=1 / num_workers=0
# keep the footprint small for a local sanity check; wandb disabled.
python -m lerobot.scripts.train \
  --policy.path=lerobot/smolvla_base \
  --policy.push_to_hub=false \
  --dataset.repo_id=lerobot/svla_so100_sorting \
  --dataset.video_backend=pyav \
  --steps=50 \
  --save_freq=25 \
  --log_freq=5 \
  --batch_size=1 \
  --num_workers=0 \
  --output_dir="$OUTPUT_DIR" \
  --wandb.enable=false
echo ""
echo "=== Training completed! ==="
echo "Check outputs in: $OUTPUT_DIR"