From 4b7cd7211ab8ed307567994b897899d3843007ad Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Thu, 9 Oct 2025 15:11:47 +0200
Subject: [PATCH] add docs and only push model once

---
 docs/source/_toctree.yml             |  2 +
 docs/source/multi_gpu_training.mdx   | 96 ++++++++++++++++++++++++++++
 src/lerobot/scripts/lerobot_train.py |  9 +--
 3 files changed, 103 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/multi_gpu_training.mdx

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 36eaea165..9ee875f5c 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -19,6 +19,8 @@
       title: Train RL in Simulation
     - local: async
       title: Use Async Inference
+    - local: multi_gpu_training
+      title: Multi-GPU Training
   title: "Tutorials"
 - sections:
     - local: lerobot-dataset-v3

diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx
new file mode 100644
index 000000000..4cda9cb01
--- /dev/null
+++ b/docs/source/multi_gpu_training.mdx
@@ -0,0 +1,96 @@
# Multi-GPU Training

This guide shows you how to train policies on multiple GPUs using [Hugging Face Accelerate](https://huggingface.co/docs/accelerate).

## Installation

First, make sure Accelerate is installed:

```bash
pip install accelerate
```

Or install it via the LeRobot `accelerate` extra:

```bash
pip install "lerobot[accelerate]"
```

## Configuration

Configure Accelerate for your hardware setup by running:

```bash
accelerate config
```

This interactive setup asks a series of questions about your training environment (number of GPUs, mixed-precision settings, etc.). For a simple multi-GPU setup on a single machine, these answers are a good starting point:

- Compute environment: This machine
- Number of machines: 1
- Number of processes: the number of GPUs you want to use
- GPU IDs to use: leave empty to use all
- Mixed precision: fp16 or bf16 (recommended for faster training)

## Training with Multiple GPUs

You can launch training in two ways:

### Option 1: Using `accelerate config` (recommended)

If you ran `accelerate config`, simply launch with:

```bash
accelerate launch $(which lerobot-train) \
    --dataset.repo_id=${HF_USER}/my_dataset \
    --policy.type=act \
    --output_dir=outputs/train/act_multi_gpu \
    --job_name=act_multi_gpu \
    --wandb.enable=true
```

### Option 2: Without a config (specify parameters directly)

If you prefer not to run `accelerate config`, you can pass all parameters on the command line:

```bash
accelerate launch \
    --multi_gpu \
    --num_processes=2 \
    --mixed_precision=fp16 \
    $(which lerobot-train) \
    --dataset.repo_id=${HF_USER}/my_dataset \
    --policy.type=act \
    --output_dir=outputs/train/act_multi_gpu \
    --job_name=act_multi_gpu \
    --wandb.enable=true
```

**Key accelerate parameters:**

- `--multi_gpu`: enable multi-GPU training
- `--num_processes=2`: number of GPUs to use
- `--mixed_precision=fp16`: use fp16 mixed precision (or `bf16` if your hardware supports it)

## How It Works

When you launch training with Accelerate:

1. **Automatic detection**: LeRobot automatically detects that it is running under Accelerate.
2. **Data distribution**: each batch is automatically split across GPUs.
3. **Gradient synchronization**: gradients are synchronized across GPUs during backpropagation.
4. **Single-process logging**: only the main process logs to wandb and saves checkpoints.
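To make these steps concrete, here is a minimal, self-contained sketch of the Accelerate pattern described above. It uses a toy model and dataset rather than LeRobot's actual training loop; `lerobot-train` performs the equivalent work for you:

```python
# Minimal sketch of the multi-GPU pattern (toy model/data, not LeRobot internals).
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader, TensorDataset

accelerator = Accelerator()  # picks up the settings chosen in `accelerate config`

policy = torch.nn.Linear(10, 2)  # toy stand-in for a policy network
dataset = TensorDataset(torch.randn(64, 10), torch.randn(64, 2))
dataloader = DataLoader(dataset, batch_size=8)
optimizer = torch.optim.AdamW(policy.parameters(), lr=1e-4)

# prepare() wraps the model for distributed training, shards the dataloader
# across processes, and ties the optimizer into mixed-precision scaling.
policy, optimizer, dataloader = accelerator.prepare(policy, optimizer, dataloader)

for x, y in dataloader:  # each process iterates over its own shard
    loss = torch.nn.functional.mse_loss(policy(x), y)
    accelerator.backward(loss)  # gradients are synchronized across GPUs here
    optimizer.step()
    optimizer.zero_grad()

if accelerator.is_main_process:  # only rank 0 should log or save
    print("training finished")
```

Launched with, e.g., `accelerate launch --num_processes=2 toy_train.py` (a hypothetical filename), each GPU processes its own slice of every batch while the model replicas stay in sync.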
## Mixed Precision Training

For faster training, you can enable mixed precision (fp16 or bf16). This is configured during `accelerate config` or by passing `--mixed_precision=fp16` to `accelerate launch`. LeRobot's `use_amp` setting is handled automatically when running under Accelerate.

## Notes

- The `--policy.use_amp` flag in `lerobot-train` only takes effect when **not** running with Accelerate. When using Accelerate, mixed precision is controlled by Accelerate's own configuration.
- Training logs, checkpoints, and Hub uploads are handled only by the main process to avoid conflicts.
- The effective batch size is `batch_size × num_gpus`: with 4 GPUs and `--batch_size=8`, the effective batch size is 32.
- Learning-rate scheduling is handled correctly across multiple processes: LeRobot sets `step_scheduler_with_optimizer=False` so that Accelerate does not rescale scheduler steps by the number of processes.
- When saving or pushing models, LeRobot automatically unwraps the model from Accelerate's distributed wrapper to ensure compatibility.

For more advanced configurations and troubleshooting, see the [Accelerate documentation](https://huggingface.co/docs/accelerate).

diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index 7d8b2853a..6202990ed 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -421,10 +421,11 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
     if not accelerator or accelerator.is_main_process:
         logging.info("End of training")
 
-    if cfg.policy.push_to_hub:
-        policy.push_model_to_hub(cfg)
-        preprocessor.push_to_hub(cfg.policy.repo_id)
-        postprocessor.push_to_hub(cfg.policy.repo_id)
+        if cfg.policy.push_to_hub:
+            unwrapped_policy = policy if not accelerator else accelerator.unwrap_model(policy)
+            unwrapped_policy.push_model_to_hub(cfg)
+            preprocessor.push_to_hub(cfg.policy.repo_id)
+            postprocessor.push_to_hub(cfg.policy.repo_id)
 
 
 def main():
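Note: the unwrap in this patch is needed because `accelerator.prepare()` can wrap the policy in `DistributedDataParallel`, which does not forward custom methods such as `push_model_to_hub`. A minimal, self-contained sketch of that behavior, using a toy policy class and the standard Accelerate API (illustrative only, not LeRobot code):

```python
# Illustrative sketch (toy policy, standard Accelerate API), not LeRobot code:
# after prepare(), custom methods live on the inner module, so we unwrap first.
import torch
from accelerate import Accelerator

class ToyPolicy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.net = torch.nn.Linear(4, 2)

    def push_model_to_hub(self, cfg):
        # Stand-in for the real upload; just shows which object gets pushed.
        print(f"pushing {type(self).__name__} with cfg={cfg}")

accelerator = Accelerator()
policy = accelerator.prepare(ToyPolicy())  # may return a DistributedDataParallel wrapper

# unwrap_model() returns the plain ToyPolicy whether or not it was wrapped,
# so the same code path works for single- and multi-GPU runs.
unwrapped_policy = accelerator.unwrap_model(policy)
unwrapped_policy.push_model_to_hub({"repo_id": "user/my_model"})
```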