diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 03f26a792..812d91fa8 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -158,3 +158,35 @@ jobs:
         run: pytest tests -vv --maxfail=10
       - name: Run end-to-end tests
         run: make test-end-to-end
+
+  # This job runs multi-GPU training tests with 4 GPUs
+  nightly-multi-gpu-tests:
+    name: Nightly Multi-GPU Tests
+    needs: [build-docker-gpu-nightly]
+    runs-on:
+      group: aws-g6-12xlarge-plus # Instance with 4 GPUs
+    env:
+      HF_HOME: /home/user_lerobot/.cache/huggingface
+      HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
+      TORCH_HOME: /home/user_lerobot/.cache/torch
+      TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
+      CUDA_VISIBLE_DEVICES: "0,1,2,3"
+    container:
+      image: ${{ needs.build-docker-gpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
+      options: --gpus all --shm-size "16gb"
+      credentials:
+        username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+        password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: /lerobot
+    steps:
+      - name: Verify GPU availability
+        run: |
+          nvidia-smi
+          python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'Number of GPUs: {torch.cuda.device_count()}')"
+
+      - name: Run multi-GPU training tests
+        run: pytest tests/training/test_multi_gpu.py -vv --maxfail=3
+        timeout-minutes: 10
diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx
index 03afc6a3a..497deb462 100644
--- a/docs/source/multi_gpu_training.mdx
+++ b/docs/source/multi_gpu_training.mdx
@@ -89,6 +89,46 @@
 For faster training, you can enable mixed precision (fp16 or bf16). This is configured during `accelerate config` or by passing `--mixed_precision=fp16` to `accelerate launch`. LeRobot's `use_amp` setting is automatically handled when using accelerate.
 
+## Learning Rate and Training Steps Scaling
+
+**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
+
+### Why No Automatic Scaling?
+
+Many distributed training frameworks automatically scale the learning rate by the number of GPUs (e.g., `lr = base_lr × num_gpus`).
+However, LeRobot keeps the learning rate exactly as you specify it.
+
+### When and How to Scale
+
+If you want to scale your hyperparameters when using multiple GPUs, you should do it manually:
+
+**Learning Rate Scaling:**
+
+```bash
+# Example: 2 GPUs with linear LR scaling
+# Base LR: 1e-4, with 2 GPUs -> 2e-4
+accelerate launch --num_processes=2 $(which lerobot-train) \
+    --optimizer.lr=2e-4 \
+    --dataset.repo_id=lerobot/pusht \
+    --policy=act
+```
+
+**Training Steps Scaling:**
+
+Since the effective batch size increases with multiple GPUs (`batch_size × num_gpus`), you may want to reduce the number of training steps proportionally:
+
+#TODO(pepijn): verify this (bs scaling)
+```bash
+# Example: 2 GPUs with effective batch size 2x larger
+# Original: batch_size=8, steps=100000
+# With 2 GPUs: batch_size=8 (16 in total), steps=50000
+accelerate launch --num_processes=2 $(which lerobot-train) \
+    --batch_size=8 \
+    --steps=50000 \
+    --dataset.repo_id=lerobot/pusht \
+    --policy=act
+```
+
 ## Notes
 
 - The `--policy.use_amp` flag in `lerobot-train` is only used when **not** running with accelerate.
   When using accelerate, mixed precision is controlled by accelerate's configuration.
@@ -98,4 +138,4 @@ For faster training, you can enable mixed precision (fp16 or bf16). This is conf
 - When saving or pushing models, LeRobot automatically unwraps the model from accelerate's distributed wrapper to ensure compatibility.
 - WandB integration automatically initializes only on the main process, preventing multiple runs from being created.
 
-For more advanced configurations and troubleshooting, see the [Accelerate documentation](https://huggingface.co/docs/accelerate).
+For more advanced configurations and troubleshooting, see the [Accelerate documentation](https://huggingface.co/docs/accelerate). If you want to learn more about how to train on a large number of GPUs, check out this awesome guide: [Ultrascale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook).
diff --git a/tests/training/test_multi_gpu.py b/tests/training/test_multi_gpu.py
new file mode 100644
index 000000000..f0668a8ef
--- /dev/null
+++ b/tests/training/test_multi_gpu.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Multi-GPU Training Tests
+
+This module tests multi-GPU training functionality with accelerate.
+These tests are designed to run on machines with 2+ GPUs and are executed
+in the nightly CI workflow.
+
+The tests automatically generate accelerate configs and launch training
+with subprocess to properly test the distributed training environment.
+"""
+
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+import pytest
+import torch
+
+
+def get_num_available_gpus():
+    """Returns the number of available GPUs."""
+    if not torch.cuda.is_available():
+        return 0
+    return torch.cuda.device_count()
+
+
+def run_accelerate_training(config_args, num_processes=4, temp_dir=None):
+    """
+    Helper function to run training with accelerate launch.
+
+    Args:
+        config_args: List of config arguments to pass to lerobot_train.py
+        num_processes: Number of processes (GPUs) to use
+        temp_dir: Temporary directory for outputs
+
+    Returns:
+        subprocess.CompletedProcess result
+    """
+    config_path = Path(temp_dir) / "accelerate_config.yaml"
+
+    # Write the accelerate config as YAML
+    with open(config_path, "w") as f:
+        f.write("compute_environment: LOCAL_MACHINE\n")
+        f.write("distributed_type: MULTI_GPU\n")
+        f.write("mixed_precision: 'no'\n")
+        f.write(f"num_processes: {num_processes}\n")
+        f.write("use_cpu: false\n")
+        f.write("gpu_ids: all\n")
+        f.write("downcast_bf16: 'no'\n")
+        f.write("machine_rank: 0\n")
+        f.write("main_training_function: main\n")
+        f.write("num_machines: 1\n")
+        f.write("rdzv_backend: static\n")
+        f.write("same_network: true\n")
+
+    cmd = [
+        "accelerate",
+        "launch",
+        "--config_file",
+        str(config_path),
+        "-m",
+        "lerobot.scripts.lerobot_train",
+    ] + config_args
+
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        env={**os.environ, "CUDA_VISIBLE_DEVICES": ",".join(map(str, range(num_processes)))},
+    )
+
+    return result
+
+
+@pytest.mark.skipif(
+    get_num_available_gpus() < 2,
+    reason="Multi-GPU tests require at least 2 GPUs",
+)
+class TestMultiGPUTraining:
+    """Test suite for multi-GPU training functionality."""
+
+    def test_basic_multi_gpu_training(self):
+        """
+        Test that basic multi-GPU training runs successfully.
+        Verifies that the training completes without errors.
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_dir = Path(temp_dir) / "outputs"
+
+            config_args = [
+                "--dataset.repo_id=lerobot/pusht",
+                "--dataset.episodes=[0]",
+                "--policy=act",
+                "--policy.device=cuda",
+                f"--output_dir={output_dir}",
+                "--batch_size=4",
+                "--steps=10",
+                "--eval_freq=-1",
+                "--log_freq=5",
+                "--save_freq=10",
+                "--seed=42",
+            ]
+
+            result = run_accelerate_training(config_args, num_processes=4, temp_dir=temp_dir)
+
+            # Check that training completed successfully
+            assert result.returncode == 0, (
+                f"Multi-GPU training failed with return code {result.returncode}\n"
+                f"STDOUT:\n{result.stdout}\n"
+                f"STDERR:\n{result.stderr}"
+            )
+
+            # Verify checkpoint was saved
+            checkpoints_dir = output_dir / "checkpoints"
+            assert checkpoints_dir.exists(), "Checkpoints directory was not created"
+
+            # Verify that training completed
+            assert "End of training" in result.stdout or "End of training" in result.stderr
+
+    def test_checkpoint_saving_multi_gpu(self):
+        """
+        Test that checkpoints are correctly saved during multi-GPU training.
+        Only the main process (rank 0) should save checkpoints.
+ """ + with tempfile.TemporaryDirectory() as temp_dir: + output_dir = Path(temp_dir) / "outputs" + + config_args = [ + "--dataset.repo_id=lerobot/pusht", + "--dataset.episodes=[0]", + "--policy=act", + "--policy.device=cuda", + f"--output_dir={output_dir}", + "--batch_size=4", + "--steps=20", + "--eval_freq=-1", + "--log_freq=5", + "--save_freq=10", + "--seed=42", + ] + + result = run_accelerate_training(config_args, num_processes=2, temp_dir=temp_dir) + + assert result.returncode == 0, ( + f"Training failed:\nSTDOUT:\n{result.stdout}\n\nSTDERR:\n{result.stderr}" + ) + + # Verify checkpoint directory exists + checkpoints_dir = output_dir / "checkpoints" + assert checkpoints_dir.exists(), "Checkpoints directory not created" + + # Count checkpoint directories (should have checkpoint at step 10 and 20) + checkpoint_dirs = [d for d in checkpoints_dir.iterdir() if d.is_dir()] + assert len(checkpoint_dirs) >= 1, f"Expected at least 1 checkpoint, found {len(checkpoint_dirs)}" + + # Verify checkpoint contents + for checkpoint_dir in checkpoint_dirs: + # Check for model files + model_files = list(checkpoint_dir.rglob("*.safetensors")) + assert len(model_files) > 0, f"No model files in checkpoint {checkpoint_dir}" + + # Check for training state + training_state_dir = checkpoint_dir / "training_state" + assert training_state_dir.exists(), f"No training state in checkpoint {checkpoint_dir}" + + # Verify optimizer state exists + optimizer_state = training_state_dir / "optimizer_state.pt" + assert optimizer_state.exists(), f"No optimizer state in checkpoint {checkpoint_dir}"