add tests

This commit is contained in:
Pepijn
2025-10-13 16:25:46 +02:00
parent a74affad7c
commit c711a628b9
3 changed files with 270 additions and 1 deletions
+32
View File
@@ -158,3 +158,35 @@ jobs:
run: pytest tests -vv --maxfail=10
- name: Run end-to-end tests
run: make test-end-to-end
# This job runs multi-GPU training tests with 4 GPUs
nightly-multi-gpu-tests:
name: Nightly Multi-GPU Tests
needs: [build-docker-gpu-nightly]
runs-on:
group: aws-g6-12xlarge-plus # Instance with 4 GPUs
env:
HF_HOME: /home/user_lerobot/.cache/huggingface
HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
TORCH_HOME: /home/user_lerobot/.cache/torch
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
CUDA_VISIBLE_DEVICES: "0,1,2,3"
container:
image: ${{ needs.build-docker-gpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
options: --gpus all --shm-size "16gb"
credentials:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
defaults:
run:
shell: bash
working-directory: /lerobot
steps:
- name: Verify GPU availability
run: |
nvidia-smi
python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'Number of GPUs: {torch.cuda.device_count()}')"
- name: Run multi-GPU training tests
run: pytest tests/training/test_multi_gpu.py -vv --maxfail=3
timeout-minutes: 10
+41 -1
View File
@@ -89,6 +89,46 @@ When you launch training with accelerate:
For faster training, you can enable mixed precision (fp16 or bf16). This is configured during `accelerate config` or by passing `--mixed_precision=fp16` to `accelerate launch`. LeRobot's `use_amp` setting is automatically handled when using accelerate.
## Learning Rate and Training Steps Scaling
**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
### Why No Automatic Scaling?
Many distributed training frameworks automatically scale the learning rate by the number of GPUs (e.g., `lr = base_lr × num_gpus`).
However, LeRobot keeps the learning rate exactly as you specify it.
### When and How to Scale
If you want to scale your hyperparameters when using multiple GPUs, you should do it manually:
**Learning Rate Scaling:**
```bash
# Example: 2 GPUs with linear LR scaling
# Base LR: 1e-4, with 2 GPUs -> 2e-4
accelerate launch --num_processes=2 $(which lerobot-train) \
--optimizer.lr=2e-4 \
--dataset.repo_id=lerobot/pusht \
--policy=act
```
**Training Steps Scaling:**
Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:
#TODO(pepijn): verify this (bs scaling)
```bash
# Example: 2 GPUs with effective batch size 2x larger
# Original: batch_size=8, steps=100000
# With 2 GPUs: batch_size=8 (16 in total), steps=50000
accelerate launch --num_processes=2 $(which lerobot-train) \
--batch_size=8 \
--steps=50000 \
--dataset.repo_id=lerobot/pusht \
--policy=act
```
## Notes
- The `--policy.use_amp` flag in `lerobot-train` is only used when **not** running with accelerate. When using accelerate, mixed precision is controlled by accelerate's configuration.
@@ -98,4 +138,4 @@ For faster training, you can enable mixed precision (fp16 or bf16). This is conf
- When saving or pushing models, LeRobot automatically unwraps the model from accelerate's distributed wrapper to ensure compatibility.
- WandB integration automatically initializes only on the main process, preventing multiple runs from being created.
For more advanced configurations and troubleshooting, see the [Accelerate documentation](https://huggingface.co/docs/accelerate).
For more advanced configurations and troubleshooting, see the [Accelerate documentation](https://huggingface.co/docs/accelerate). If you want to learn more about how to train on a large number of GPUs, checkout this awesome guide: [Ultrascale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook).
+197
View File
@@ -0,0 +1,197 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Multi-GPU Training Tests
This module tests multi-GPU training functionality with accelerate.
These tests are designed to run on machines with 2+ GPUs and are executed
in the nightly CI workflow.
The tests automatically generate accelerate configs and launch training
with subprocess to properly test the distributed training environment.
"""
import json
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
import pytest
import torch
def get_num_available_gpus():
"""Returns the number of available GPUs."""
if not torch.cuda.is_available():
return 0
return torch.cuda.device_count()
def run_accelerate_training(config_args, num_processes=4, temp_dir=None):
"""
Helper function to run training with accelerate launch.
Args:
config_args: List of config arguments to pass to lerobot_train.py
num_processes: Number of processes (GPUs) to use
temp_dir: Temporary directory for outputs
Returns:
subprocess.CompletedProcess result
"""
# Create accelerate config
accelerate_config = {
"compute_environment": "LOCAL_MACHINE",
"distributed_type": "MULTI_GPU",
"mixed_precision": "no",
"num_processes": num_processes,
"use_cpu": False,
"gpu_ids": "all",
}
config_path = Path(temp_dir) / "accelerate_config.yaml"
# Write YAML config
with open(config_path, "w") as f:
f.write("compute_environment: LOCAL_MACHINE\n")
f.write("distributed_type: MULTI_GPU\n")
f.write("mixed_precision: 'no'\n")
f.write(f"num_processes: {num_processes}\n")
f.write("use_cpu: false\n")
f.write("gpu_ids: all\n")
f.write("downcast_bf16: 'no'\n")
f.write("machine_rank: 0\n")
f.write("main_training_function: main\n")
f.write("num_machines: 1\n")
f.write("rdzv_backend: static\n")
f.write("same_network: true\n")
cmd = [
"accelerate",
"launch",
"--config_file",
str(config_path),
"-m",
"lerobot.scripts.lerobot_train",
] + config_args
result = subprocess.run(
cmd,
capture_output=True,
text=True,
env={**os.environ, "CUDA_VISIBLE_DEVICES": ",".join(map(str, range(num_processes)))},
)
return result
@pytest.mark.skipif(
get_num_available_gpus() < 2,
reason="Multi-GPU tests require at least 2 GPUs",
)
class TestMultiGPUTraining:
"""Test suite for multi-GPU training functionality."""
def test_basic_multi_gpu_training(self):
"""
Test that basic multi-GPU training runs successfully.
Verifies that the training completes without errors.
"""
with tempfile.TemporaryDirectory() as temp_dir:
output_dir = Path(temp_dir) / "outputs"
config_args = [
"--dataset.repo_id=lerobot/pusht",
"--dataset.episodes=[0]",
"--policy=act",
"--policy.device=cuda",
f"--output_dir={output_dir}",
"--batch_size=4",
"--steps=10",
"--eval_freq=-1",
"--log_freq=5",
"--save_freq=10",
"--seed=42",
]
result = run_accelerate_training(config_args, num_processes=4, temp_dir=temp_dir)
# Check that training completed successfully
assert result.returncode == 0, (
f"Multi-GPU training failed with return code {result.returncode}\n"
f"STDOUT:\n{result.stdout}\n"
f"STDERR:\n{result.stderr}"
)
# Verify checkpoint was saved
checkpoints_dir = output_dir / "checkpoints"
assert checkpoints_dir.exists(), "Checkpoints directory was not created"
# Verify that training completed
assert "End of training" in result.stdout or "End of training" in result.stderr
def test_checkpoint_saving_multi_gpu(self):
"""
Test that checkpoints are correctly saved during multi-GPU training.
Only the main process (rank 0) should save checkpoints.
"""
with tempfile.TemporaryDirectory() as temp_dir:
output_dir = Path(temp_dir) / "outputs"
config_args = [
"--dataset.repo_id=lerobot/pusht",
"--dataset.episodes=[0]",
"--policy=act",
"--policy.device=cuda",
f"--output_dir={output_dir}",
"--batch_size=4",
"--steps=20",
"--eval_freq=-1",
"--log_freq=5",
"--save_freq=10",
"--seed=42",
]
result = run_accelerate_training(config_args, num_processes=2, temp_dir=temp_dir)
assert result.returncode == 0, (
f"Training failed:\nSTDOUT:\n{result.stdout}\n\nSTDERR:\n{result.stderr}"
)
# Verify checkpoint directory exists
checkpoints_dir = output_dir / "checkpoints"
assert checkpoints_dir.exists(), "Checkpoints directory not created"
# Count checkpoint directories (should have checkpoint at step 10 and 20)
checkpoint_dirs = [d for d in checkpoints_dir.iterdir() if d.is_dir()]
assert len(checkpoint_dirs) >= 1, f"Expected at least 1 checkpoint, found {len(checkpoint_dirs)}"
# Verify checkpoint contents
for checkpoint_dir in checkpoint_dirs:
# Check for model files
model_files = list(checkpoint_dir.rglob("*.safetensors"))
assert len(model_files) > 0, f"No model files in checkpoint {checkpoint_dir}"
# Check for training state
training_state_dir = checkpoint_dir / "training_state"
assert training_state_dir.exists(), f"No training state in checkpoint {checkpoint_dir}"
# Verify optimizer state exists
optimizer_state = training_state_dir / "optimizer_state.pt"
assert optimizer_state.exists(), f"No optimizer state in checkpoint {checkpoint_dir}"