diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx
index a1318e772..89ab2a6dc 100644
--- a/docs/source/multi_gpu_training.mdx
+++ b/docs/source/multi_gpu_training.mdx
@@ -102,7 +102,6 @@ accelerate launch --num_processes=2 $(which lerobot-train) \

 Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:

-#TODO(pepijn): verify this (bs scaling)
 ```bash
 # Example: 2 GPUs with effective batch size 2x larger
 # Original: batch_size=8, steps=100000
diff --git a/src/lerobot/optim/schedulers.py b/src/lerobot/optim/schedulers.py
index 55ee62e40..e62867841 100644
--- a/src/lerobot/optim/schedulers.py
+++ b/src/lerobot/optim/schedulers.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import abc
+import logging
 import math
 from dataclasses import asdict, dataclass
 from pathlib import Path
@@ -79,7 +80,11 @@ class VQBeTSchedulerConfig(LRSchedulerConfig):
 @LRSchedulerConfig.register_subclass("cosine_decay_with_warmup")
 @dataclass
 class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
-    """Used by Physical Intelligence to train Pi0"""
+    """Used by Physical Intelligence to train Pi0.
+
+    Automatically scales warmup and decay steps if num_training_steps < num_decay_steps.
+    This ensures the learning rate schedule completes properly even with shorter training runs.
+    """

     num_warmup_steps: int
     num_decay_steps: int
@@ -87,23 +92,39 @@ class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
     decay_lr: float

     def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
-        del num_training_steps
+        # Auto-scale scheduler parameters if training steps are shorter than configured decay steps
+        actual_warmup_steps = self.num_warmup_steps
+        actual_decay_steps = self.num_decay_steps
+
+        if num_training_steps < self.num_decay_steps:
+            # Calculate scaling factor to fit the schedule into the available training steps
+            scale_factor = num_training_steps / self.num_decay_steps
+            actual_warmup_steps = int(self.num_warmup_steps * scale_factor)
+            actual_decay_steps = num_training_steps
+
+            logging.info(
+                f"Auto-scaling LR scheduler: "
+                f"num_training_steps ({num_training_steps}) < num_decay_steps ({self.num_decay_steps}). "
+                f"Scaling warmup: {self.num_warmup_steps} → {actual_warmup_steps}, "
+                f"decay: {self.num_decay_steps} → {actual_decay_steps} "
+                f"(scale factor: {scale_factor:.3f})"
+            )

         def lr_lambda(current_step):
             def linear_warmup_schedule(current_step):
                 if current_step <= 0:
-                    return 1 / (self.num_warmup_steps + 1)
-                frac = 1 - current_step / self.num_warmup_steps
-                return (1 / (self.num_warmup_steps + 1) - 1) * frac + 1
+                    return 1 / (actual_warmup_steps + 1)
+                frac = 1 - current_step / actual_warmup_steps
+                return (1 / (actual_warmup_steps + 1) - 1) * frac + 1

             def cosine_decay_schedule(current_step):
-                step = min(current_step, self.num_decay_steps)
-                cosine_decay = 0.5 * (1 + math.cos(math.pi * step / self.num_decay_steps))
+                step = min(current_step, actual_decay_steps)
+                cosine_decay = 0.5 * (1 + math.cos(math.pi * step / actual_decay_steps))
                 alpha = self.decay_lr / self.peak_lr
                 decayed = (1 - alpha) * cosine_decay + alpha
                 return decayed

-            if current_step < self.num_warmup_steps:
+            if current_step < actual_warmup_steps:
                 return linear_warmup_schedule(current_step)
             return cosine_decay_schedule(current_step)

diff --git a/src/lerobot/policies/pi0/configuration_pi0.py b/src/lerobot/policies/pi0/configuration_pi0.py
index cc1cda9d8..d745f4317 100644
--- a/src/lerobot/policies/pi0/configuration_pi0.py
+++ b/src/lerobot/policies/pi0/configuration_pi0.py
@@ -75,6 +75,8 @@ class PI0Config(PreTrainedConfig):
     optimizer_grad_clip_norm: float = 1.0

     # Scheduler settings: see openpi `CosineDecaySchedule`
+    # Note: These will auto-scale if --steps < scheduler_decay_steps
+    # For example, --steps=3000 will scale warmup to 100 and decay to 3000
     scheduler_warmup_steps: int = 1_000
     scheduler_decay_steps: int = 30_000
     scheduler_decay_lr: float = 2.5e-6
diff --git a/src/lerobot/policies/pi05/configuration_pi05.py b/src/lerobot/policies/pi05/configuration_pi05.py
index 7c1e950b0..61346c330 100644
--- a/src/lerobot/policies/pi05/configuration_pi05.py
+++ b/src/lerobot/policies/pi05/configuration_pi05.py
@@ -75,6 +75,8 @@ class PI05Config(PreTrainedConfig):
     optimizer_grad_clip_norm: float = 1.0

     # Scheduler settings: see openpi `CosineDecaySchedule`
+    # Note: These will auto-scale if --steps < scheduler_decay_steps
+    # For example, --steps=3000 will scale warmup to 100 and decay to 3000
     scheduler_warmup_steps: int = 1_000
     scheduler_decay_steps: int = 30_000
     scheduler_decay_lr: float = 2.5e-6
diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index 3cf99b6ef..663934d66 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -431,6 +431,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
         unwrapped_policy.push_model_to_hub(cfg)
         preprocessor.push_to_hub(cfg.policy.repo_id)
         postprocessor.push_to_hub(cfg.policy.repo_id)
+
+    # Properly clean up the distributed process group
+    accelerator.wait_for_everyone()
+    accelerator.end_training()


 def main():