mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-15 08:39:49 +00:00
d148279921
- 2-GPU SLURM job (distributed training) - 1-GPU local accelerate and direct training scripts - Accelerate configs for 1-GPU and 2-GPU setups
46 lines
1.2 KiB
Bash
Executable File
46 lines
1.2 KiB
Bash
Executable File
#!/bin/bash
#
# Quick local smoke test: run `lerobot.scripts.train` directly (without
# accelerate) for 50 steps on a single GPU, using the `lerobot/smolvla_base`
# policy and the `lerobot/svla_so100_sorting` dataset.
#
# Usage: run with no arguments. Paths below are hard-coded for the
# original author's cluster layout; adjust them for other machines.
#
# Exit status: 0 if training succeeded; 1 if environment setup failed;
# otherwise the exit status of the training process.

# Print an error message to stderr and abort the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

echo "=== Direct 1-GPU Training Test with SmolVLA (no accelerate) ==="
echo "Environment: multi"
echo "GPU: 1"
echo "Steps: 50 (quick local test)"
echo ""

# Activate conda environment.
# NOTE: `set -u` is intentionally not enabled; sourced conda activation
# scripts may reference unset variables.
conda_sh=/fsx/dana_aubakirova/miniconda3/etc/profile.d/conda.sh
[[ -r "$conda_sh" ]] || die "conda init script not found: $conda_sh"
# shellcheck source=/dev/null
source "$conda_sh" || die "failed to source $conda_sh"
conda activate multi || die "failed to activate conda environment 'multi'"

# Set CUDA environment for 1 GPU (only device 0 is made visible).
export CUDA_VISIBLE_DEVICES=0
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True
export TORCH_DISTRIBUTED_DEBUG=OFF
export CUDA_LAUNCH_BLOCKING=0
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1

# Change to working directory; abort rather than train from the wrong place.
work_dir=/fsx/dana_aubakirova/vla/pr/lerobot
cd "$work_dir" || die "cannot cd to $work_dir"

# Set output directory with timestamp so repeated runs do not collide.
OUTPUT_DIR="outputs/test_direct_1gpu_local_$(date +%Y%m%d_%H%M%S)"
export OUTPUT_DIR

echo "Output directory: $OUTPUT_DIR"
echo ""

# Training arguments are kept in an array so each one stays a single word.
train_args=(
  --policy.path=lerobot/smolvla_base
  --policy.push_to_hub=false
  --dataset.repo_id=lerobot/svla_so100_sorting
  --dataset.video_backend=pyav
  --steps=50
  --save_freq=25
  --log_freq=5
  --batch_size=1
  --num_workers=0
  --output_dir="$OUTPUT_DIR"
  --wandb.enable=false
)

# Test direct training with 1 GPU (no accelerate).
python -m lerobot.scripts.train "${train_args[@]}"
train_status=$?

echo ""
if (( train_status != 0 )); then
  # Report failure and propagate the trainer's exit status to the caller.
  printf '=== Training FAILED (exit status %d) ===\n' "$train_status" >&2
  echo "Check outputs in: $OUTPUT_DIR" >&2
  exit "$train_status"
fi

echo "=== Training completed! ==="
echo "Check outputs in: $OUTPUT_DIR"