From 3174e14bc02ec8c1fa5a3dbe899774355fe88197 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 18 May 2026 14:46:38 +0200 Subject: [PATCH] fix(smolvla2): feed all cameras to VQA generation, not just the chosen one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit handle_vqa_query filtered the observation down to the single chosen camera before calling the VLM. But training feeds every camera: the ask_vqa_* recipes' image blocks are stripped before tokenization and the frames reach the model via OBS_IMAGES_*, where embed_prefix consumes all config.image_features regardless of the per-camera recipe tag. Filtering to one camera changed the image-token count in the prefix (the dropped camera zero-padded with mask=0) — a prefix shape the model never saw at training. Now the full observation is passed to select_message; the chosen camera is used only to pick which frame the bbox/point overlay is drawn on. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/smolvla2/inference/vqa.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py index bdf345214..b4139083c 100644 --- a/src/lerobot/policies/smolvla2/inference/vqa.py +++ b/src/lerobot/policies/smolvla2/inference/vqa.py @@ -310,19 +310,20 @@ def handle_vqa_query( else: report(" [info] vqa: no camera available — answering text-only") - # Ground the question on the chosen camera only — filter the - # observation to that one image (+ proprio state) so the VLM - # prefix matches the single-image ``ask_vqa_*`` training recipe. - vqa_obs: dict | None = None - if observation is not None and chosen is not None: - vqa_obs = {chosen: observation[chosen]} - if "observation.state" in observation: - vqa_obs["observation.state"] = observation["observation.state"] - + # Feed the FULL observation (every camera + state) to the VLM. The + # ``ask_vqa_*`` recipes look single-camera, but the image *block* is + # stripped before tokenization — the actual frames reach the model + # via SmolVLA's ``OBS_IMAGES_*`` channels, and ``embed_prefix`` + # consumes *all* ``config.image_features`` regardless of which + # camera the sub-recipe was tagged for. So training always sees + # every camera; filtering to one here would change the image-token + # count in the prefix (the dropped camera gets zero-padded with + # mask=0) — a prefix shape the model never saw. The chosen camera + # is used only to pick which frame the overlay is drawn on. answer = _generate_with_policy( policy, _msgs_for_vqa(question), - observation=vqa_obs, + observation=observation, state=state, label="vqa gen", )