From d22fa47ac0dec789047ee05c11e6af625f611797 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng <54488861+2toinf@users.noreply.github.com> Date: Fri, 28 Nov 2025 04:12:04 +0800 Subject: [PATCH] Enhance X-VLA finetuning documentation with optimizer details (#2537) Added detailed instructions for implementing a custom optimizer and modifying parameter retrieval for X-VLA finetuning. Signed-off-by: Jinliang Zheng <54488861+2toinf@users.noreply.github.com> --- docs/source/xvla.mdx | 70 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/docs/source/xvla.mdx b/docs/source/xvla.mdx index 1509ac46d..06cce280d 100644 --- a/docs/source/xvla.mdx +++ b/docs/source/xvla.mdx @@ -157,6 +157,76 @@ lerobot-train \ --policy.train_soft_prompts=True ``` + + +💡 **Best Performance:** If you have sufficient computational resources and want to achieve the best X-VLA finetuning performance, you should follow the official finetuning strategy: + +**🔥 Full-finetune all components with a custom learning-rate scheme** + +To ensure stable optimization, the Vision-Language Model (VLM) must be trained with only 1/10 of the base learning rate, while all other components use the full LR. +This LR ratio is crucial for achieving strong and stable finetuning performance. +To enable this behavior, you must: +1. Implement a custom optimizer and register it in your training config +``` +from dataclasses import dataclass, asdict +from lerobot.optim.optimizers import OptimizerConfig +import torch + +@OptimizerConfig.register_subclass("xvla-adamw") +@dataclass +class XVLAAdamW(OptimizerConfig): + lr: float = 1e-4 + betas: tuple[float, float] = (0.9, 0.99) + eps: float = 1e-8 + weight_decay: float = 0.0 + grad_clip_norm: float = 10.0 + + def build(self, params: dict) -> torch.optim.Optimizer: + """ + Expect `named_parameters()` as input. + Apply lr = lr / 10 for all VLM-related parameters. 
+ """ + assert isinstance(params, dict), \ + "Custom LR optimizer requires `named_parameters()` as inputs." + kwargs = asdict(self) + kwargs.pop("grad_clip_norm") + vlm_group, other_group = [], [] + for name, p in params.items(): + if not p.requires_grad: + continue + if "vlm" in name.lower(): + vlm_group.append(p) + else: + other_group.append(p) + + param_groups = [ + {"params": vlm_group, "lr": self.lr * 0.1, "weight_decay": self.weight_decay * 0.1}, + {"params": other_group, "lr": self.lr, "weight_decay": self.weight_decay}, + ] + + return torch.optim.AdamW(param_groups, **kwargs) +``` +2. Modify X-VLA’s `get_optim_params` to return named parameters + +Replace: +``` +def get_optim_params(self) -> dict: + """Return only trainable parameters for optimization.""" + return filter(lambda p: p.requires_grad, self.parameters()) +``` +with: +``` +def get_optim_params(self): + """Return trainable named parameters.""" + return filter(lambda kv: kv[1].requires_grad, self.named_parameters()) +``` +This gives the optimizer each parameter's name, allowing it to correctly detect VLM modules and apply the 1/10 LR rule. Note that `filter(...)` yields `(name, parameter)` pairs rather than a `dict`, so wrap the result in `dict(...)` if it is passed directly to `build`, which asserts `isinstance(params, dict)`. + +❕Note + +Completely matching the officially reported performance may require an additional warm-up LR schedule for soft-prompts, which can bring minor improvements. +We encourage implementing this in your customized training pipeline for optimal results. + --- ## Core Concepts