From 3174e14bc02ec8c1fa5a3dbe899774355fe88197 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 18 May 2026 14:46:38 +0200
Subject: [PATCH] fix(smolvla2): feed all cameras to VQA generation, not just
 the chosen one
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

handle_vqa_query filtered the observation down to the single chosen
camera before calling the VLM. But training feeds every camera: the
ask_vqa_* recipes' image blocks are stripped before tokenization and
the frames reach the model via OBS_IMAGES_*, where embed_prefix
consumes all config.image_features regardless of the per-camera recipe
tag. Filtering to one camera changed the image-token count in the
prefix (the dropped camera was zero-padded with mask=0) — a prefix
shape the model never saw during training.

Now the full observation is passed to select_message; the chosen
camera is used only to pick which frame the bbox/point overlay is
drawn on.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../policies/smolvla2/inference/vqa.py        | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py
index bdf345214..b4139083c 100644
--- a/src/lerobot/policies/smolvla2/inference/vqa.py
+++ b/src/lerobot/policies/smolvla2/inference/vqa.py
@@ -310,19 +310,20 @@ def handle_vqa_query(
     else:
         report("  [info] vqa: no camera available — answering text-only")
 
-    # Ground the question on the chosen camera only — filter the
-    # observation to that one image (+ proprio state) so the VLM
-    # prefix matches the single-image ``ask_vqa_*`` training recipe.
-    vqa_obs: dict | None = None
-    if observation is not None and chosen is not None:
-        vqa_obs = {chosen: observation[chosen]}
-        if "observation.state" in observation:
-            vqa_obs["observation.state"] = observation["observation.state"]
-
+    # Feed the FULL observation (every camera + state) to the VLM. The
+    # ``ask_vqa_*`` recipes look single-camera, but the image *block* is
+    # stripped before tokenization — the actual frames reach the model
+    # via SmolVLA's ``OBS_IMAGES_*`` channels, and ``embed_prefix``
+    # consumes *all* ``config.image_features`` regardless of which
+    # camera the sub-recipe was tagged for. So training always sees
+    # every camera; filtering to one here would change the image-token
+    # count in the prefix (the dropped camera gets zero-padded with
+    # mask=0) — a prefix shape the model never saw. The chosen camera
+    # is used only to pick which frame the overlay is drawn on.
     answer = _generate_with_policy(
         policy,
         _msgs_for_vqa(question),
-        observation=vqa_obs,
+        observation=observation,
         state=state,
         label="vqa gen",
     )