scale lr decay if we reduce steps

This commit is contained in:
Pepijn
2025-10-14 15:59:46 +02:00
parent 9950bfd66f
commit a66b50d372
5 changed files with 37 additions and 9 deletions
-1
View File
@@ -102,7 +102,6 @@ accelerate launch --num_processes=2 $(which lerobot-train) \
Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally: Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:
#TODO(pepijn): verify this (bs scaling)
```bash ```bash
# Example: 2 GPUs with effective batch size 2x larger # Example: 2 GPUs with effective batch size 2x larger
# Original: batch_size=8, steps=100000 # Original: batch_size=8, steps=100000
+29 -8
View File
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import abc import abc
import logging
import math import math
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from pathlib import Path from pathlib import Path
@@ -79,7 +80,11 @@ class VQBeTSchedulerConfig(LRSchedulerConfig):
@LRSchedulerConfig.register_subclass("cosine_decay_with_warmup") @LRSchedulerConfig.register_subclass("cosine_decay_with_warmup")
@dataclass @dataclass
class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig): class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
"""Used by Physical Intelligence to train Pi0""" """Used by Physical Intelligence to train Pi0.
Automatically scales warmup and decay steps if num_training_steps < num_decay_steps.
This ensures the learning rate schedule completes properly even with shorter training runs.
"""
num_warmup_steps: int num_warmup_steps: int
num_decay_steps: int num_decay_steps: int
@@ -87,23 +92,39 @@ class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
decay_lr: float decay_lr: float
def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR: def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
del num_training_steps # Auto-scale scheduler parameters if training steps are shorter than configured decay steps
actual_warmup_steps = self.num_warmup_steps
actual_decay_steps = self.num_decay_steps
if num_training_steps < self.num_decay_steps:
# Calculate scaling factor to fit the schedule into the available training steps
scale_factor = num_training_steps / self.num_decay_steps
actual_warmup_steps = int(self.num_warmup_steps * scale_factor)
actual_decay_steps = num_training_steps
logging.info(
f"Auto-scaling LR scheduler: "
f"num_training_steps ({num_training_steps}) < num_decay_steps ({self.num_decay_steps}). "
f"Scaling warmup: {self.num_warmup_steps} -> {actual_warmup_steps}, "
f"decay: {self.num_decay_steps} -> {actual_decay_steps} "
f"(scale factor: {scale_factor:.3f})"
)
def lr_lambda(current_step): def lr_lambda(current_step):
def linear_warmup_schedule(current_step): def linear_warmup_schedule(current_step):
if current_step <= 0: if current_step <= 0:
return 1 / (self.num_warmup_steps + 1) return 1 / (actual_warmup_steps + 1)
frac = 1 - current_step / self.num_warmup_steps frac = 1 - current_step / actual_warmup_steps
return (1 / (self.num_warmup_steps + 1) - 1) * frac + 1 return (1 / (actual_warmup_steps + 1) - 1) * frac + 1
def cosine_decay_schedule(current_step): def cosine_decay_schedule(current_step):
step = min(current_step, self.num_decay_steps) step = min(current_step, actual_decay_steps)
cosine_decay = 0.5 * (1 + math.cos(math.pi * step / self.num_decay_steps)) cosine_decay = 0.5 * (1 + math.cos(math.pi * step / actual_decay_steps))
alpha = self.decay_lr / self.peak_lr alpha = self.decay_lr / self.peak_lr
decayed = (1 - alpha) * cosine_decay + alpha decayed = (1 - alpha) * cosine_decay + alpha
return decayed return decayed
if current_step < self.num_warmup_steps: if current_step < actual_warmup_steps:
return linear_warmup_schedule(current_step) return linear_warmup_schedule(current_step)
return cosine_decay_schedule(current_step) return cosine_decay_schedule(current_step)
@@ -75,6 +75,8 @@ class PI0Config(PreTrainedConfig):
optimizer_grad_clip_norm: float = 1.0 optimizer_grad_clip_norm: float = 1.0
# Scheduler settings: see openpi `CosineDecaySchedule` # Scheduler settings: see openpi `CosineDecaySchedule`
# Note: These will auto-scale if --steps < scheduler_decay_steps
# For example, --steps=3000 will scale warmup to 100 and decay to 3000
scheduler_warmup_steps: int = 1_000 scheduler_warmup_steps: int = 1_000
scheduler_decay_steps: int = 30_000 scheduler_decay_steps: int = 30_000
scheduler_decay_lr: float = 2.5e-6 scheduler_decay_lr: float = 2.5e-6
@@ -75,6 +75,8 @@ class PI05Config(PreTrainedConfig):
optimizer_grad_clip_norm: float = 1.0 optimizer_grad_clip_norm: float = 1.0
# Scheduler settings: see openpi `CosineDecaySchedule` # Scheduler settings: see openpi `CosineDecaySchedule`
# Note: These will auto-scale if --steps < scheduler_decay_steps
# For example, --steps=3000 will scale warmup to 100 and decay to 3000
scheduler_warmup_steps: int = 1_000 scheduler_warmup_steps: int = 1_000
scheduler_decay_steps: int = 30_000 scheduler_decay_steps: int = 30_000
scheduler_decay_lr: float = 2.5e-6 scheduler_decay_lr: float = 2.5e-6
+4
View File
@@ -432,6 +432,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
preprocessor.push_to_hub(cfg.policy.repo_id) preprocessor.push_to_hub(cfg.policy.repo_id)
postprocessor.push_to_hub(cfg.policy.repo_id) postprocessor.push_to_hub(cfg.policy.repo_id)
# Properly clean up the distributed process group
accelerator.wait_for_everyone()
accelerator.end_training()
def main(): def main():
init_logging() init_logging()