diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py
index b4139083c..4d68c4d0f 100644
--- a/src/lerobot/policies/smolvla2/inference/vqa.py
+++ b/src/lerobot/policies/smolvla2/inference/vqa.py
@@ -81,7 +81,7 @@ def prompt_camera_choice(
     input_fn: Any = input,
     print_fn: Any = print,
 ) -> str | None:
-    """Ask the operator which camera to ground a VQA question on.
+    """Ask the operator which camera frame to draw a VQA overlay on.
 
     Accepts either the menu number or the (short or full) camera name.
     A single-camera setup auto-selects without prompting. Returns the
@@ -92,7 +92,7 @@ def prompt_camera_choice(
         return None
     if len(cameras) == 1:
         return cameras[0]
-    print_fn("Which camera should I look at?")
+    print_fn("Draw the result on which camera?")
     for i, cam in enumerate(cameras, 1):
         print_fn(f"  [{i}] {camera_short_name(cam)}")
     try:
@@ -299,27 +299,13 @@ def handle_vqa_query(
         except Exception as exc:  # noqa: BLE001
             logger.debug("observation_provider raised %s", exc)
 
-    cameras = available_cameras(observation)
-    chosen: str | None = None
-    if cameras:
-        chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
-        if chosen is None:
-            report("  [info] vqa cancelled — no camera selected")
-            return
-        report(f"  vqa camera: {camera_short_name(chosen)}")
-    else:
-        report("  [info] vqa: no camera available — answering text-only")
-
     # Feed the FULL observation (every camera + state) to the VLM. The
     # ``ask_vqa_*`` recipes look single-camera, but the image *block* is
     # stripped before tokenization — the actual frames reach the model
     # via SmolVLA's ``OBS_IMAGES_*`` channels, and ``embed_prefix``
     # consumes *all* ``config.image_features`` regardless of which
-    # camera the sub-recipe was tagged for. So training always sees
-    # every camera; filtering to one here would change the image-token
-    # count in the prefix (the dropped camera gets zero-padded with
-    # mask=0) — a prefix shape the model never saw. The chosen camera
-    # is used only to pick which frame the overlay is drawn on.
+    # camera the sub-recipe was tagged for. So the model always sees
+    # every camera; the operator need not pick one to ask a question.
     answer = _generate_with_policy(
         policy,
         _msgs_for_vqa(question),
@@ -337,14 +323,24 @@ def handle_vqa_query(
         if parsed is None:
             report("  [info] vqa answer is not JSON — no overlay")
         return
-    if observation is None or chosen is None:
+
+    # The answer carries a bounding box / point. Its pixel coordinates
+    # are camera-specific and the text answer doesn't say which camera,
+    # so ask the operator *now* — only when there is actually something
+    # to draw — which camera frame to render the overlay on.
+    cameras = available_cameras(observation)
+    if observation is None or not cameras:
         report("  [info] no camera image — cannot draw overlay")
         return
+    chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
+    if chosen is None:
+        report("  [info] overlay skipped — no camera selected")
+        return
     try:
         pil = observation_image_to_pil(observation[chosen])
         overlay = draw_vqa_overlay(pil, parsed)
         path = save_and_open_overlay(overlay)
-        report(f"  vqa overlay saved: {path}")
+        report(f"  vqa overlay ({camera_short_name(chosen)}) saved: {path}")
     except Exception as exc:  # noqa: BLE001
         logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
         report(f"  [warn] vqa overlay failed: {type(exc).__name__}: {exc}")