From a48d4e32a1dee97949c42acb89c3b4252bda8ab6 Mon Sep 17 00:00:00 2001
From: pepijn223
Date: Thu, 4 Jun 2026 17:20:34 +0200
Subject: [PATCH] fix(pi05): don't scale image features by sqrt(hidden_size)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lerobot/pi05_base was trained in the OpenPI/big_vision regime where image
(soft) tokens are NOT multiplied by the Gemma embedder normalizer
(sqrt(hidden_size)) — only text tokens are. Scaling image features here
over-scaled them ~45x, breaking the pretrained vision-language alignment
and yielding ~0% closed-loop success on RoboCasa across all pi05 runs.

Co-authored-by: Cursor
---
 src/lerobot/policies/pi05/modeling_pi05.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/policies/pi05/modeling_pi05.py b/src/lerobot/policies/pi05/modeling_pi05.py
index dc5a26ed0..12d662c17 100644
--- a/src/lerobot/policies/pi05/modeling_pi05.py
+++ b/src/lerobot/policies/pi05/modeling_pi05.py
@@ -477,7 +477,11 @@ class PaliGemmaWithExpertModel(
         if image.dtype != torch.float32:
             image = image.to(torch.float32)
         image_outputs = self.paligemma.model.get_image_features(image)
-        features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
+        # OpenPI / big_vision convention: image (soft) tokens are NOT scaled by the
+        # Gemma embedder normalizer (sqrt(hidden_size)) — only text tokens are. lerobot/pi05_base
+        # was trained in this regime, so scaling image features here over-scales them ~45x and
+        # breaks the pretrained vision-language alignment. Keep image features un-normalized.
+        features = image_outputs.pooler_output
         if features.dtype != out_dtype:
             features = features.to(out_dtype)
         return features