diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx
index a1318e772..89ab2a6dc 100644
--- a/docs/source/multi_gpu_training.mdx
+++ b/docs/source/multi_gpu_training.mdx
@@ -102,7 +102,6 @@ accelerate launch --num_processes=2 $(which lerobot-train) \

 Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:

-#TODO(pepijn): verify this (bs scaling)
 ```bash
 # Example: 2 GPUs with effective batch size 2x larger
 # Original: batch_size=8, steps=100000
diff --git a/src/lerobot/optim/schedulers.py b/src/lerobot/optim/schedulers.py
index 55ee62e40..e62867841 100644
--- a/src/lerobot/optim/schedulers.py
+++ b/src/lerobot/optim/schedulers.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import abc
+import logging
 import math
 from dataclasses import asdict, dataclass
 from pathlib import Path
@@ -79,7 +80,11 @@ class VQBeTSchedulerConfig(LRSchedulerConfig):
 @LRSchedulerConfig.register_subclass("cosine_decay_with_warmup")
 @dataclass
 class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
-    """Used by Physical Intelligence to train Pi0"""
+    """Used by Physical Intelligence to train Pi0.
+
+    Automatically scales warmup and decay steps if num_training_steps < num_decay_steps.
+    This ensures the learning rate schedule completes properly even with shorter training runs.
+    """

     num_warmup_steps: int
     num_decay_steps: int
@@ -87,23 +92,39 @@ class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
     decay_lr: float

     def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
-        del num_training_steps
+        # Auto-scale scheduler parameters if training steps are shorter than configured decay steps
+        actual_warmup_steps = self.num_warmup_steps
+        actual_decay_steps = self.num_decay_steps
+
+        if num_training_steps < self.num_decay_steps:
+            # Calculate scaling factor to fit the schedule into the available training steps
+            scale_factor = num_training_steps / self.num_decay_steps
+            actual_warmup_steps = int(self.num_warmup_steps * scale_factor)
+            actual_decay_steps = num_training_steps
+
+            logging.info(
+                f"Auto-scaling LR scheduler: "
+                f"num_training_steps ({num_training_steps}) < num_decay_steps ({self.num_decay_steps}). "
+                f"Scaling warmup: {self.num_warmup_steps} → {actual_warmup_steps}, "
+                f"decay: {self.num_decay_steps} → {actual_decay_steps} "
+                f"(scale factor: {scale_factor:.3f})"
+            )

         def lr_lambda(current_step):
             def linear_warmup_schedule(current_step):
                 if current_step <= 0:
-                    return 1 / (self.num_warmup_steps + 1)
-                frac = 1 - current_step / self.num_warmup_steps
-                return (1 / (self.num_warmup_steps + 1) - 1) * frac + 1
+                    return 1 / (actual_warmup_steps + 1)
+                frac = 1 - current_step / actual_warmup_steps
+                return (1 / (actual_warmup_steps + 1) - 1) * frac + 1

             def cosine_decay_schedule(current_step):
-                step = min(current_step, self.num_decay_steps)
-                cosine_decay = 0.5 * (1 + math.cos(math.pi * step / self.num_decay_steps))
+                step = min(current_step, actual_decay_steps)
+                cosine_decay = 0.5 * (1 + math.cos(math.pi * step / actual_decay_steps))
                 alpha = self.decay_lr / self.peak_lr
                 decayed = (1 - alpha) * cosine_decay + alpha
                 return decayed

-            if current_step < self.num_warmup_steps:
+            if current_step < actual_warmup_steps:
                 return linear_warmup_schedule(current_step)
             return cosine_decay_schedule(current_step)

diff --git a/src/lerobot/policies/pi0/configuration_pi0.py b/src/lerobot/policies/pi0/configuration_pi0.py
index cc1cda9d8..d745f4317 100644
--- a/src/lerobot/policies/pi0/configuration_pi0.py
+++ b/src/lerobot/policies/pi0/configuration_pi0.py
@@ -75,6 +75,8 @@ class PI0Config(PreTrainedConfig):
     optimizer_grad_clip_norm: float = 1.0

     # Scheduler settings: see openpi `CosineDecaySchedule`
+    # Note: These will auto-scale if --steps < scheduler_decay_steps
+    # For example, --steps=3000 will scale warmup to 100 and decay to 3000
     scheduler_warmup_steps: int = 1_000
     scheduler_decay_steps: int = 30_000
     scheduler_decay_lr: float = 2.5e-6
diff --git a/src/lerobot/policies/pi05/configuration_pi05.py b/src/lerobot/policies/pi05/configuration_pi05.py
index 7c1e950b0..61346c330 100644
--- a/src/lerobot/policies/pi05/configuration_pi05.py
+++ b/src/lerobot/policies/pi05/configuration_pi05.py
@@ -75,6 +75,8 @@ class PI05Config(PreTrainedConfig):
     optimizer_grad_clip_norm: float = 1.0

     # Scheduler settings: see openpi `CosineDecaySchedule`
+    # Note: These will auto-scale if --steps < scheduler_decay_steps
+    # For example, --steps=3000 will scale warmup to 100 and decay to 3000
     scheduler_warmup_steps: int = 1_000
     scheduler_decay_steps: int = 30_000
     scheduler_decay_lr: float = 2.5e-6
diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index 3cf99b6ef..663934d66 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -431,6 +431,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
         unwrapped_policy.push_model_to_hub(cfg)
         preprocessor.push_to_hub(cfg.policy.repo_id)
         postprocessor.push_to_hub(cfg.policy.repo_id)
+
+    # Properly clean up the distributed process group
+    accelerator.wait_for_everyone()
+    accelerator.end_training()


 def main():