diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 03f26a792..812d91fa8 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -158,3 +158,35 @@ jobs:
         run: pytest tests -vv --maxfail=10
       - name: Run end-to-end tests
         run: make test-end-to-end
+
+  # This job runs multi-GPU training tests with 4 GPUs
+  nightly-multi-gpu-tests:
+    name: Nightly Multi-GPU Tests
+    needs: [build-docker-gpu-nightly]
+    runs-on:
+      group: aws-g6-12xlarge-plus # Instance with 4 GPUs
+    env:
+      HF_HOME: /home/user_lerobot/.cache/huggingface
+      HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
+      TORCH_HOME: /home/user_lerobot/.cache/torch
+      TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
+      CUDA_VISIBLE_DEVICES: "0,1,2,3"
+    container:
+      image: ${{ needs.build-docker-gpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
+      options: --gpus all --shm-size "16gb"
+      credentials:
+        username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+        password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: /lerobot
+    steps:
+      - name: Verify GPU availability
+        run: |
+          nvidia-smi
+          python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'Number of GPUs: {torch.cuda.device_count()}')"
+
+      - name: Run multi-GPU training tests
+        run: pytest tests/training/test_multi_gpu.py -vv --maxfail=3
+        timeout-minutes: 10
diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx
index 03afc6a3a..497deb462 100644
--- a/docs/source/multi_gpu_training.mdx
+++ b/docs/source/multi_gpu_training.mdx
@@ -89,6 +89,46 @@
 For faster training, you can enable mixed precision (fp16 or bf16). This is configured during `accelerate config` or by passing `--mixed_precision=fp16` to `accelerate launch`. LeRobot's `use_amp` setting is automatically handled when using accelerate.
 
+## Learning Rate and Training Steps Scaling
+
+**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
+
+### Why No Automatic Scaling?
+
+Many distributed training frameworks automatically scale the learning rate by the number of GPUs (e.g., `lr = base_lr × num_gpus`).
+However, LeRobot keeps the learning rate exactly as you specify it.
+
+### When and How to Scale
+
+If you want to scale your hyperparameters when using multiple GPUs, you should do it manually:
+
+**Learning Rate Scaling:**
+
+```bash
+# Example: 2 GPUs with linear LR scaling
+# Base LR: 1e-4, with 2 GPUs -> 2e-4
+accelerate launch --num_processes=2 $(which lerobot-train) \
+    --optimizer.lr=2e-4 \
+    --dataset.repo_id=lerobot/pusht \
+    --policy=act
+```
+
+**Training Steps Scaling:**
+
+Since the effective batch size increases with multiple GPUs (`batch_size × num_gpus`), you may want to reduce the number of training steps proportionally:
+
+#TODO(pepijn): verify this (bs scaling)
+```bash
+# Example: 2 GPUs with effective batch size 2x larger
+# Original: batch_size=8, steps=100000
+# With 2 GPUs: batch_size=8 (16 in total), steps=50000
+accelerate launch --num_processes=2 $(which lerobot-train) \
+    --batch_size=8 \
+    --steps=50000 \
+    --dataset.repo_id=lerobot/pusht \
+    --policy=act
+```
+
 ## Notes
 
 - The `--policy.use_amp` flag in `lerobot-train` is only used when **not** running with accelerate.
   When using accelerate, mixed precision is controlled by accelerate's configuration.
@@ -98,4 +138,4 @@ For faster training, you can enable mixed precision (fp16 or bf16). This is conf
 - When saving or pushing models, LeRobot automatically unwraps the model from accelerate's distributed wrapper to ensure compatibility.
 - WandB integration automatically initializes only on the main process, preventing multiple runs from being created.
 
-For more advanced configurations and troubleshooting, see the [Accelerate documentation](https://huggingface.co/docs/accelerate).
+For more advanced configurations and troubleshooting, see the [Accelerate documentation](https://huggingface.co/docs/accelerate). If you want to learn more about how to train on a large number of GPUs, check out this awesome guide: [Ultrascale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook).
diff --git a/tests/training/test_multi_gpu.py b/tests/training/test_multi_gpu.py
new file mode 100644
index 000000000..f0668a8ef
--- /dev/null
+++ b/tests/training/test_multi_gpu.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Multi-GPU Training Tests
+
+This module tests multi-GPU training functionality with accelerate.
+These tests are designed to run on machines with 2+ GPUs and are executed
+in the nightly CI workflow.
+
+The tests automatically generate accelerate configs and launch training
+with subprocess to properly test the distributed training environment.
+"""
+
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+import pytest
+import torch
+
+
+def get_num_available_gpus():
+    """Returns the number of available GPUs."""
+    if not torch.cuda.is_available():
+        return 0
+    return torch.cuda.device_count()
+
+
+def run_accelerate_training(config_args, num_processes=4, temp_dir=None):
+    """
+    Helper function to run training with accelerate launch.
+
+    Args:
+        config_args: List of config arguments to pass to lerobot_train.py
+        num_processes: Number of processes (GPUs) to use
+        temp_dir: Temporary directory for outputs
+
+    Returns:
+        subprocess.CompletedProcess result
+    """
+    config_path = Path(temp_dir) / "accelerate_config.yaml"
+
+    # Write the accelerate config as YAML
+    with open(config_path, "w") as f:
+        f.write("compute_environment: LOCAL_MACHINE\n")
+        f.write("distributed_type: MULTI_GPU\n")
+        f.write("mixed_precision: 'no'\n")
+        f.write(f"num_processes: {num_processes}\n")
+        f.write("use_cpu: false\n")
+        f.write("gpu_ids: all\n")
+        f.write("downcast_bf16: 'no'\n")
+        f.write("machine_rank: 0\n")
+        f.write("main_training_function: main\n")
+        f.write("num_machines: 1\n")
+        f.write("rdzv_backend: static\n")
+        f.write("same_network: true\n")
+
+    cmd = [
+        "accelerate",
+        "launch",
+        "--config_file",
+        str(config_path),
+        "-m",
+        "lerobot.scripts.lerobot_train",
+    ] + config_args
+
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        env={**os.environ, "CUDA_VISIBLE_DEVICES": ",".join(map(str, range(num_processes)))},
+    )
+
+    return result
+
+
+@pytest.mark.skipif(
+    get_num_available_gpus() < 2,
+    reason="Multi-GPU tests require at least 2 GPUs",
+)
+class TestMultiGPUTraining:
+    """Test suite for multi-GPU training functionality."""
+
+    def test_basic_multi_gpu_training(self):
+        """
+        Test that basic multi-GPU training runs successfully.
+        Verifies that the training completes without errors.
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_dir = Path(temp_dir) / "outputs"
+
+            config_args = [
+                "--dataset.repo_id=lerobot/pusht",
+                "--dataset.episodes=[0]",
+                "--policy=act",
+                "--policy.device=cuda",
+                f"--output_dir={output_dir}",
+                "--batch_size=4",
+                "--steps=10",
+                "--eval_freq=-1",
+                "--log_freq=5",
+                "--save_freq=10",
+                "--seed=42",
+            ]
+
+            result = run_accelerate_training(config_args, num_processes=4, temp_dir=temp_dir)
+
+            # Check that training completed successfully
+            assert result.returncode == 0, (
+                f"Multi-GPU training failed with return code {result.returncode}\n"
+                f"STDOUT:\n{result.stdout}\n"
+                f"STDERR:\n{result.stderr}"
+            )
+
+            # Verify checkpoint was saved
+            checkpoints_dir = output_dir / "checkpoints"
+            assert checkpoints_dir.exists(), "Checkpoints directory was not created"
+
+            # Verify that training completed
+            assert "End of training" in result.stdout or "End of training" in result.stderr
+
+    def test_checkpoint_saving_multi_gpu(self):
+        """
+        Test that checkpoints are correctly saved during multi-GPU training.
+        Only the main process (rank 0) should save checkpoints.
+ """ + with tempfile.TemporaryDirectory() as temp_dir: + output_dir = Path(temp_dir) / "outputs" + + config_args = [ + "--dataset.repo_id=lerobot/pusht", + "--dataset.episodes=[0]", + "--policy=act", + "--policy.device=cuda", + f"--output_dir={output_dir}", + "--batch_size=4", + "--steps=20", + "--eval_freq=-1", + "--log_freq=5", + "--save_freq=10", + "--seed=42", + ] + + result = run_accelerate_training(config_args, num_processes=2, temp_dir=temp_dir) + + assert result.returncode == 0, ( + f"Training failed:\nSTDOUT:\n{result.stdout}\n\nSTDERR:\n{result.stderr}" + ) + + # Verify checkpoint directory exists + checkpoints_dir = output_dir / "checkpoints" + assert checkpoints_dir.exists(), "Checkpoints directory not created" + + # Count checkpoint directories (should have checkpoint at step 10 and 20) + checkpoint_dirs = [d for d in checkpoints_dir.iterdir() if d.is_dir()] + assert len(checkpoint_dirs) >= 1, f"Expected at least 1 checkpoint, found {len(checkpoint_dirs)}" + + # Verify checkpoint contents + for checkpoint_dir in checkpoint_dirs: + # Check for model files + model_files = list(checkpoint_dir.rglob("*.safetensors")) + assert len(model_files) > 0, f"No model files in checkpoint {checkpoint_dir}" + + # Check for training state + training_state_dir = checkpoint_dir / "training_state" + assert training_state_dir.exists(), f"No training state in checkpoint {checkpoint_dir}" + + # Verify optimizer state exists + optimizer_state = training_state_dir / "optimizer_state.pt" + assert optimizer_state.exists(), f"No optimizer state in checkpoint {checkpoint_dir}"