diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py index bdf345214..b4139083c 100644 --- a/src/lerobot/policies/smolvla2/inference/vqa.py +++ b/src/lerobot/policies/smolvla2/inference/vqa.py @@ -310,19 +310,20 @@ def handle_vqa_query( else: report(" [info] vqa: no camera available — answering text-only") - # Ground the question on the chosen camera only — filter the - # observation to that one image (+ proprio state) so the VLM - # prefix matches the single-image ``ask_vqa_*`` training recipe. - vqa_obs: dict | None = None - if observation is not None and chosen is not None: - vqa_obs = {chosen: observation[chosen]} - if "observation.state" in observation: - vqa_obs["observation.state"] = observation["observation.state"] - + # Feed the FULL observation (every camera + state) to the VLM. The + # ``ask_vqa_*`` recipes look single-camera, but the image *block* is + # stripped before tokenization — the actual frames reach the model + # via SmolVLA's ``OBS_IMAGES_*`` channels, and ``embed_prefix`` + # consumes *all* ``config.image_features`` regardless of which + # camera the sub-recipe was tagged for. So training always sees + # every camera; filtering to one here would change the image-token + # count in the prefix (the dropped camera gets zero-padded with + # mask=0) — a prefix shape the model never saw. The chosen camera + # is used only to pick which frame the overlay is drawn on. answer = _generate_with_policy( policy, _msgs_for_vqa(question), - observation=vqa_obs, + observation=observation, state=state, label="vqa gen", )