mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 12:09:42 +00:00
scale lr decay if we reduce steps
This commit is contained in:
@@ -102,7 +102,6 @@ accelerate launch --num_processes=2 $(which lerobot-train) \
|
|||||||
|
|
||||||
Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:
|
Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:
|
||||||
|
|
||||||
#TODO(pepijn): verify this (bs scaling)
|
|
||||||
```bash
|
```bash
|
||||||
# Example: 2 GPUs with effective batch size 2x larger
|
# Example: 2 GPUs with effective batch size 2x larger
|
||||||
# Original: batch_size=8, steps=100000
|
# Original: batch_size=8, steps=100000
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
import abc
|
import abc
|
||||||
|
import logging
|
||||||
import math
|
import math
|
||||||
from dataclasses import asdict, dataclass
|
from dataclasses import asdict, dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -79,7 +80,11 @@ class VQBeTSchedulerConfig(LRSchedulerConfig):
|
|||||||
@LRSchedulerConfig.register_subclass("cosine_decay_with_warmup")
|
@LRSchedulerConfig.register_subclass("cosine_decay_with_warmup")
|
||||||
@dataclass
|
@dataclass
|
||||||
class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
|
class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
|
||||||
"""Used by Physical Intelligence to train Pi0"""
|
"""Used by Physical Intelligence to train Pi0.
|
||||||
|
|
||||||
|
Automatically scales warmup and decay steps if num_training_steps < num_decay_steps.
|
||||||
|
This ensures the learning rate schedule completes properly even with shorter training runs.
|
||||||
|
"""
|
||||||
|
|
||||||
num_warmup_steps: int
|
num_warmup_steps: int
|
||||||
num_decay_steps: int
|
num_decay_steps: int
|
||||||
@@ -87,23 +92,39 @@ class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
|
|||||||
decay_lr: float
|
decay_lr: float
|
||||||
|
|
||||||
def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
|
def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
|
||||||
del num_training_steps
|
# Auto-scale scheduler parameters if training steps are shorter than configured decay steps
|
||||||
|
actual_warmup_steps = self.num_warmup_steps
|
||||||
|
actual_decay_steps = self.num_decay_steps
|
||||||
|
|
||||||
|
if num_training_steps < self.num_decay_steps:
|
||||||
|
# Calculate scaling factor to fit the schedule into the available training steps
|
||||||
|
scale_factor = num_training_steps / self.num_decay_steps
|
||||||
|
actual_warmup_steps = int(self.num_warmup_steps * scale_factor)
|
||||||
|
actual_decay_steps = num_training_steps
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
f"Auto-scaling LR scheduler: "
|
||||||
|
f"num_training_steps ({num_training_steps}) < num_decay_steps ({self.num_decay_steps}). "
|
||||||
|
f"Scaling warmup: {self.num_warmup_steps} → {actual_warmup_steps}, "
|
||||||
|
f"decay: {self.num_decay_steps} → {actual_decay_steps} "
|
||||||
|
f"(scale factor: {scale_factor:.3f})"
|
||||||
|
)
|
||||||
|
|
||||||
def lr_lambda(current_step):
|
def lr_lambda(current_step):
|
||||||
def linear_warmup_schedule(current_step):
|
def linear_warmup_schedule(current_step):
|
||||||
if current_step <= 0:
|
if current_step <= 0:
|
||||||
return 1 / (self.num_warmup_steps + 1)
|
return 1 / (actual_warmup_steps + 1)
|
||||||
frac = 1 - current_step / self.num_warmup_steps
|
frac = 1 - current_step / actual_warmup_steps
|
||||||
return (1 / (self.num_warmup_steps + 1) - 1) * frac + 1
|
return (1 / (actual_warmup_steps + 1) - 1) * frac + 1
|
||||||
|
|
||||||
def cosine_decay_schedule(current_step):
|
def cosine_decay_schedule(current_step):
|
||||||
step = min(current_step, self.num_decay_steps)
|
step = min(current_step, actual_decay_steps)
|
||||||
cosine_decay = 0.5 * (1 + math.cos(math.pi * step / self.num_decay_steps))
|
cosine_decay = 0.5 * (1 + math.cos(math.pi * step / actual_decay_steps))
|
||||||
alpha = self.decay_lr / self.peak_lr
|
alpha = self.decay_lr / self.peak_lr
|
||||||
decayed = (1 - alpha) * cosine_decay + alpha
|
decayed = (1 - alpha) * cosine_decay + alpha
|
||||||
return decayed
|
return decayed
|
||||||
|
|
||||||
if current_step < self.num_warmup_steps:
|
if current_step < actual_warmup_steps:
|
||||||
return linear_warmup_schedule(current_step)
|
return linear_warmup_schedule(current_step)
|
||||||
|
|
||||||
return cosine_decay_schedule(current_step)
|
return cosine_decay_schedule(current_step)
|
||||||
|
|||||||
@@ -75,6 +75,8 @@ class PI0Config(PreTrainedConfig):
|
|||||||
optimizer_grad_clip_norm: float = 1.0
|
optimizer_grad_clip_norm: float = 1.0
|
||||||
|
|
||||||
# Scheduler settings: see openpi `CosineDecaySchedule`
|
# Scheduler settings: see openpi `CosineDecaySchedule`
|
||||||
|
# Note: These will auto-scale if --steps < scheduler_decay_steps
|
||||||
|
# For example, --steps=3000 will scale warmup to 100 and decay to 3000
|
||||||
scheduler_warmup_steps: int = 1_000
|
scheduler_warmup_steps: int = 1_000
|
||||||
scheduler_decay_steps: int = 30_000
|
scheduler_decay_steps: int = 30_000
|
||||||
scheduler_decay_lr: float = 2.5e-6
|
scheduler_decay_lr: float = 2.5e-6
|
||||||
|
|||||||
@@ -75,6 +75,8 @@ class PI05Config(PreTrainedConfig):
|
|||||||
optimizer_grad_clip_norm: float = 1.0
|
optimizer_grad_clip_norm: float = 1.0
|
||||||
|
|
||||||
# Scheduler settings: see openpi `CosineDecaySchedule`
|
# Scheduler settings: see openpi `CosineDecaySchedule`
|
||||||
|
# Note: These will auto-scale if --steps < scheduler_decay_steps
|
||||||
|
# For example, --steps=3000 will scale warmup to 100 and decay to 3000
|
||||||
scheduler_warmup_steps: int = 1_000
|
scheduler_warmup_steps: int = 1_000
|
||||||
scheduler_decay_steps: int = 30_000
|
scheduler_decay_steps: int = 30_000
|
||||||
scheduler_decay_lr: float = 2.5e-6
|
scheduler_decay_lr: float = 2.5e-6
|
||||||
|
|||||||
@@ -432,6 +432,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
|
|||||||
preprocessor.push_to_hub(cfg.policy.repo_id)
|
preprocessor.push_to_hub(cfg.policy.repo_id)
|
||||||
postprocessor.push_to_hub(cfg.policy.repo_id)
|
postprocessor.push_to_hub(cfg.policy.repo_id)
|
||||||
|
|
||||||
|
# Properly clean up the distributed process group
|
||||||
|
accelerator.wait_for_everyone()
|
||||||
|
accelerator.end_training()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
init_logging()
|
init_logging()
|
||||||
|
|||||||
Reference in New Issue
Block a user