From 8ba3b187a15dcef4a0b4e1b3e9e1a38cef76e44b Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 25 May 2026 20:57:43 +0200 Subject: [PATCH] pi052: bump lm_head_lr_scale default to 5.0 (keep base LR at 2.5e-5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The base optimizer LR (2.5e-5, cosine to 2.5e-6, 1k warmup, AdamW (0.9, 0.95), wd 0.01, grad_clip 1.0) is the openpi/π0.5 setting used for the RoboCasa leaderboard baselines and is well-validated for 3B- class VLAs with a paligemma backbone. Leave it alone. The one place pi052 needs to diverge from pi05 is the LM-head LR multiplier: * pi05 has no text supervision -> head doesn't get gradients -> lm_head_lr_scale is moot, stays at 1.0. * pi052 always has text supervision via the recipe (subtask / memory / VQA). Under KI, the LM head only sees gradients on ~30-45% of the batch (the text-CE mask share). Under aggressive cosine decay the head drifts back toward PaliGemma's pretrained first-token bias, despite teacher-forced CE staying near 0. 5x is the documented fix (see PI05Config.lm_head_lr_scale docstring and PI05Policy.get_optim_params, which is already wired to split the LM head + tied embed_tokens into their own param group while sharing the same cosine lambda). Flipping the default here lifts the fix from opt-in to on-by-default for every pi052 run, with zero downside on text-free recipes (head still gets no gradients to scale). Other LR knobs reviewed and intentionally NOT changed: - optimizer_lr=2.5e-5: openpi-validated, matches leaderboard. - scheduler_warmup_steps=1000: standard for VLA finetuning. - scheduler_decay_steps=30000: auto-scales for short runs. - optimizer_betas=(0.9, 0.95): GPT/LLM convention, works for flow-matching + LM-CE. - optimizer_weight_decay=0.01, grad_clip=1.0: standard. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/pi052/configuration_pi052.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/lerobot/policies/pi052/configuration_pi052.py b/src/lerobot/policies/pi052/configuration_pi052.py index 5b4c25924..84a570c67 100644 --- a/src/lerobot/policies/pi052/configuration_pi052.py +++ b/src/lerobot/policies/pi052/configuration_pi052.py @@ -163,6 +163,23 @@ class PI052Config(PI05Config): """If True, route every transformer layer through the KI attention path that blocks action→VLM gradient flow on K/V.""" + # Learning-rate defaults -------------------------------------------- + # pi052 inherits π0.5's openpi-validated optimizer config (peak LR + # 2.5e-5, cosine→2.5e-6, 1k warmup, AdamW (0.9, 0.95), wd=0.01, + # grad_clip=1.0). The only place pi052 needs to diverge from pi05 + # is the LM-head LR multiplier: pi05 has no text supervision so the + # head doesn't get gradients; pi052 always has text supervision + # (subtask / memory / VQA) via the recipe, and under KI the LM head + # only sees gradients on ~30–45% of the batch (the text-CE mask + # share of the recipe). Under aggressive cosine decay this is too + # weak to keep the head pinned, so it drifts back toward PaliGemma's + # pretrained first-token bias. 5x is the documented fix + # (see ``PI05Config.lm_head_lr_scale`` docstring); the wiring is + # already in ``PI05Policy.get_optim_params`` — it splits the LM head + # + tied ``embed_tokens`` into their own param group while sharing + # the same cosine lambda, so the 5x ratio is preserved across decay. + lm_head_lr_scale: float = 5.0 + def __post_init__(self) -> None: super().__post_init__() # Backbone needs gradients flowing through the text head when