From 15229468d0dca89f863c475bf5a41d10b983c374 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 18 May 2026 14:17:03 +0200
Subject: [PATCH] feat(smolvla2): startup mode prompt; rename /vlm mode to
 /question

Add a mode prompt at startup, shown before the task picker, so the
operator chooses action (run the robot) vs question (VQA only) up front
instead of having to discover /vlm mid-run.

Also rename the VQA mode from "vlm" to the clearer "question":
- state["mode"] value is now "action" | "question"
- the command is /question (short form /q; /vlm and /vqa kept as aliases)
- panels, hints and help text updated to match

handle_vqa_query now reports via both push_log and direct stdout, so
VQA answers / overlay paths are visible in autonomous question mode
where the panel redraw is suspended.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../policies/smolvla2/inference/repl.py       | 12 +--
 .../smolvla2/inference/runtime_state.py       |  4 +-
 src/lerobot/policies/smolvla2/inference/ui.py |  2 +-
 .../policies/smolvla2/inference/vqa.py        | 36 +++++---
 .../scripts/lerobot_smolvla2_runtime.py       | 83 +++++++++++++------
 5 files changed, 88 insertions(+), 49 deletions(-)

diff --git a/src/lerobot/policies/smolvla2/inference/repl.py b/src/lerobot/policies/smolvla2/inference/repl.py
index 7ab84dbf2..11a4814ca 100644
--- a/src/lerobot/policies/smolvla2/inference/repl.py
+++ b/src/lerobot/policies/smolvla2/inference/repl.py
@@ -16,7 +16,7 @@
 Reads non-blocking stdin lines, classifies each one heuristically:
 
   "stop" / "quit" / "exit"               → state["stop"] = True
-  "/action" / "/vlm"                      → set state["mode"]
+  "/action" / "/question"                 → set state["mode"]
   ends with "?"                           → user_vqa_query event
   starts with "task:" or first line       → set runtime task
   anything else                           → user_interjection event
@@ -75,14 +75,14 @@ class StdinReader:
             state["stop"] = True
             return
 
-        # Slash commands flip the run mode. ``/vlm`` pauses the action
-        # loop (the action steps gate on ``state["mode"]``); ``/action``
-        # resumes it.
+        # Slash commands flip the run mode. ``/question`` pauses the
+        # action loop (the action steps gate on ``state["mode"]``);
+        # ``/action`` resumes it. ``/vlm`` / ``/vqa`` are kept as aliases.
         if lower in {"/action", "/act"}:
             state["mode"] = "action"
             return
-        if lower in {"/vlm", "/vqa"}:
-            state["mode"] = "vlm"
+        if lower in {"/question", "/q", "/vlm", "/vqa"}:
+            state["mode"] = "question"
             queue = state.get("action_queue")
             if hasattr(queue, "clear"):
                 queue.clear()
diff --git a/src/lerobot/policies/smolvla2/inference/runtime_state.py b/src/lerobot/policies/smolvla2/inference/runtime_state.py
index 49f2f8874..1a808846c 100644
--- a/src/lerobot/policies/smolvla2/inference/runtime_state.py
+++ b/src/lerobot/policies/smolvla2/inference/runtime_state.py
@@ -33,8 +33,8 @@ Stable keys (read by multiple steps):
   events_this_tick list[str]    triggers consumed this tick
   _tick         Tick            current tick (set by the loop)
 
-  mode          str             "action" (run the robot) | "vlm" (VQA only,
-                                 action loop paused)
+  mode          str             "action" (run the robot) | "question" (VQA
+                                 only, action loop paused)
 
   log_lines     list[str]       human-readable status lines printed each tick
 """
diff --git a/src/lerobot/policies/smolvla2/inference/ui.py b/src/lerobot/policies/smolvla2/inference/ui.py
index 567610bce..aa70eb9a8 100644
--- a/src/lerobot/policies/smolvla2/inference/ui.py
+++ b/src/lerobot/policies/smolvla2/inference/ui.py
@@ -93,7 +93,7 @@ def make_state_panel(state: dict[str, Any]) -> Any:
     table.add_row("", footer)
     run_mode = state.get("mode", "action")
     mode_tag = (
-        "[green]action[/]" if run_mode == "action" else "[yellow]vlm (paused)[/]"
+        "[green]action[/]" if run_mode == "action" else "[yellow]question (paused)[/]"
     )
     return Panel(
         table,
diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py
index 3263b33bd..bdf345214 100644
--- a/src/lerobot/policies/smolvla2/inference/vqa.py
+++ b/src/lerobot/policies/smolvla2/inference/vqa.py
@@ -272,14 +272,24 @@ def handle_vqa_query(
     """Run one interactive VQA question end to end.
 
     Called synchronously from the input layer while the runtime is in
-    ``/vlm`` mode (the action loop is gated off, so the policy is not in
-    concurrent use). All progress is reported via :func:`push_log` so it
-    shows up in the state panel's scrollback.
+    ``/question`` mode (the action loop is gated off, so the policy is
+    not in concurrent use). Progress is reported via both
+    :func:`push_log` (REPL panel scrollback) and ``print_fn`` (direct
+    stdout) — in autonomous question mode the panel redraw is suspended,
+    so the direct print is what the operator actually sees.
     """
     from .steps import _generate_with_policy, _msgs_for_vqa  # noqa: PLC0415
 
+    def report(line: str) -> None:
+        """Surface a line both to the panel scrollback and to stdout."""
+        push_log(state, line)
+        try:
+            print_fn(line)
+        except Exception:  # noqa: BLE001
+            pass
+
     if policy is None or not hasattr(policy, "select_message"):
-        push_log(state, "  [warn] vqa: policy has no select_message — skipping")
+        report("  [warn] vqa: policy has no select_message — skipping")
         return
 
     observation: dict | None = None
@@ -294,11 +304,11 @@ def handle_vqa_query(
     if cameras:
         chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
         if chosen is None:
-            push_log(state, "  [info] vqa cancelled — no camera selected")
+            report("  [info] vqa cancelled — no camera selected")
             return
-        push_log(state, f"  vqa camera: {camera_short_name(chosen)}")
+        report(f"  vqa camera: {camera_short_name(chosen)}")
     else:
-        push_log(state, "  [info] vqa: no camera available — answering text-only")
+        report("  [info] vqa: no camera available — answering text-only")
 
     # Ground the question on the chosen camera only — filter the
     # observation to that one image (+ proprio state) so the VLM
@@ -317,23 +327,23 @@ def handle_vqa_query(
         label="vqa gen",
     )
     if not answer:
-        push_log(state, "  [info] vqa gen returned empty")
+        report("  [info] vqa gen returned empty")
         return
-    push_log(state, f"  vqa: {answer}")
+    report(f"  vqa: {answer}")
 
     parsed = parse_vqa_answer(answer)
     if not answer_has_overlay(parsed):
         if parsed is None:
-            push_log(state, "  [info] vqa answer is not JSON — no overlay")
+            report("  [info] vqa answer is not JSON — no overlay")
         return
     if observation is None or chosen is None:
-        push_log(state, "  [info] no camera image — cannot draw overlay")
+        report("  [info] no camera image — cannot draw overlay")
         return
     try:
         pil = observation_image_to_pil(observation[chosen])
         overlay = draw_vqa_overlay(pil, parsed)
         path = save_and_open_overlay(overlay)
-        push_log(state, f"  vqa overlay saved: {path}")
+        report(f"  vqa overlay saved: {path}")
     except Exception as exc:  # noqa: BLE001
         logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
-        push_log(state, f"  [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
+        report(f"  [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
diff --git a/src/lerobot/scripts/lerobot_smolvla2_runtime.py b/src/lerobot/scripts/lerobot_smolvla2_runtime.py
index 7c4153961..bb165a188 100644
--- a/src/lerobot/scripts/lerobot_smolvla2_runtime.py
+++ b/src/lerobot/scripts/lerobot_smolvla2_runtime.py
@@ -965,12 +965,33 @@ def _select_task_interactively(ds_meta: Any, current_task: str | None) -> str |
     return raw
 
 
+def _select_mode_interactively() -> str:
+    """Prompt for the startup run mode: ``"action"`` or ``"question"``.
+
+    Shown at startup, before the task picker. ``2`` or a question / vlm /
+    vqa alias selects ``question``; any other reply, EOF, Ctrl+C, or a
+    non-TTY stdin returns ``action`` so scripted runs are unaffected.
+    """
+    if not sys.stdin.isatty():
+        return "action"
+    print("[smolvla2] Start in which mode?", flush=True)
+    print("  [1] action   — run the robot autonomously (default)", flush=True)
+    print("  [2] question — ask the VLM questions (VQA); robot stays paused", flush=True)
+    try:
+        raw = input("mode> (Enter = action) ").strip().lower()
+    except (EOFError, KeyboardInterrupt):
+        return "action"  # EOF / Ctrl+C at the prompt: fall back to the default mode
+    if raw in {"2", "question", "q", "/question", "/q", "vlm", "vqa", "/vlm", "/vqa"}:
+        return "question"
+    return "action"
+
+
 def _print_runtime_help() -> None:
     """Print the slash-command reference."""
     print(
         "[smolvla2] commands:\n"
         "  /action            run the robot (default mode)\n"
-        "  /vlm               pause the action loop; typed lines become VQA questions\n"
+        "  /question          pause the action loop; typed lines become VQA questions\n"
         "  /help              show this help\n"
         "  task: <text>       switch task (clears plan / memory / subtask)\n"
         "  rephrase: <text>   reword the task in place\n"
@@ -980,24 +1001,25 @@ def _print_runtime_help() -> None:
 
 
 def _handle_slash_command(runtime: Any, line: str) -> bool:
-    """Handle ``/action`` / ``/vlm`` / ``/help``.
+    """Handle ``/action`` / ``/question`` / ``/help``.
 
-    Returns ``True`` when ``line`` was a recognised command (and was
-    consumed), ``False`` otherwise.
+    ``/vlm`` and ``/vqa`` are kept as aliases for ``/question``. Returns
+    ``True`` when ``line`` was a recognised command (and was consumed),
+    ``False`` otherwise.
     """
     cmd = line.strip().lower()
     if cmd in {"/action", "/act"}:
         runtime.state["mode"] = "action"
         print("[smolvla2] mode: action — robot running", flush=True)
         return True
-    if cmd in {"/vlm", "/vqa"}:
-        runtime.state["mode"] = "vlm"
+    if cmd in {"/question", "/q", "/vlm", "/vqa"}:
+        runtime.state["mode"] = "question"
         # Drop any queued chunk so no stale action fires while paused.
         queue = runtime.state.get("action_queue")
         if hasattr(queue, "clear"):
             queue.clear()
         print(
-            "[smolvla2] mode: vlm — action loop paused; type VQA questions",
+            "[smolvla2] mode: question — action loop paused; type VQA questions",
             flush=True,
         )
         return True
@@ -1010,8 +1032,8 @@ def _handle_slash_command(runtime: Any, line: str) -> bool:
 def _run_vqa_query(runtime: Any, question: str) -> None:
     """Run one interactive VQA question against the runtime's policy.
 
-    Used by both loops when in ``/vlm`` mode — the action loop is paused
-    so the policy is free for a synchronous VQA call.
+    Used by both loops when in ``/question`` mode — the action loop is
+    paused so the policy is free for a synchronous VQA call.
     """
     from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query  # noqa: PLC0415
 
@@ -1089,7 +1111,7 @@ def _run_autonomous(
     redraw()
     print(
         "  [autonomous] type interjections / '?' questions on stdin; "
-        "/vlm for VQA mode, /action to resume, /help for commands, "
+        "/question for VQA mode, /action to resume, /help for commands, "
         "'stop' or Ctrl+C to quit",
         flush=True,
     )
@@ -1133,18 +1155,18 @@ def _run_autonomous(
             lower = line.lower()
             if lower in {"stop", "quit", "exit"}:
                 break
-            # Slash commands (/action, /vlm, /help) flip the run mode.
+            # Slash commands (/action, /question, /help) flip the run mode.
             if _handle_slash_command(runtime, line):
                 # Redraw once so the panel reflects the new mode. In
-                # ``/vlm`` the timer redraw is now suspended, so this is
-                # the last clear — the VQA prompt below stays stable.
+                # ``/question`` the timer redraw is now suspended, so
+                # this is the last clear — the VQA prompt stays stable.
                 try:
                     redraw()
                 except Exception:  # noqa: BLE001
                     pass
-                if runtime.state.get("mode") == "vlm":
+                if runtime.state.get("mode") == "question":
                     print(
-                        "  [vlm] type a VQA question and press Enter; "
+                        "  [question] type a VQA question and press Enter; "
                         "/action to resume the robot.",
                         flush=True,
                     )
@@ -1187,10 +1209,10 @@ def _run_autonomous(
             if not runtime.state.get("task"):
                 runtime.set_task(line)
                 continue
-            # ``/vlm`` mode: the whole line is a VQA question, handled
-            # synchronously (the action loop is paused so the policy is
-            # not in concurrent use by the background runtime thread).
-            if runtime.state.get("mode", "action") == "vlm":
+            # ``/question`` mode: the whole line is a VQA question,
+            # handled synchronously (the action loop is paused so the
+            # policy is not in concurrent use by the background thread).
+            if runtime.state.get("mode", "action") == "question":
                 _run_vqa_query(runtime, line)
                 continue
             if lower.endswith("?"):
@@ -1242,7 +1264,7 @@ def _make_state_panel_renderer(
         mode_tag = (
             "[green]mode: action[/]"
             if run_mode == "action"
-            else "[yellow]mode: vlm (action loop paused)[/]"
+            else "[yellow]mode: question (action loop paused)[/]"
         )
         console.rule(
             f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan"
@@ -1252,7 +1274,7 @@ def _make_state_panel_renderer(
         # away under the timer redraw).
         if run_mode == "action":
             console.print(
-                "  [dim]commands:[/] [bold]/vlm[/] ask a VQA question  ·  "
+                "  [dim]commands:[/] [bold]/question[/] ask a VQA question  ·  "
                 "[bold]/help[/] all commands  ·  [bold]stop[/] quit"
             )
         else:
@@ -1335,7 +1357,7 @@ def _make_state_panel_renderer(
             console.print()
         if not st.get("task"):
             console.print(
-                "  [dim]Type the task to begin. /vlm switches to VQA mode, "
+                "  [dim]Type the task to begin. /question switches to VQA mode, "
                 "/action resumes the robot, /help lists commands. "
                 "Type 'stop' to exit.[/]"
             )
@@ -1438,6 +1460,11 @@ def main(argv: list[str] | None = None) -> int:
                 flush=True,
             )
 
+    # Startup mode prompt — choose action (run the robot) vs question
+    # (VQA only) *before* the task picker, so the operator sets intent
+    # up front. It can still be flipped any time with /action or /question.
+    startup_mode = _select_mode_interactively()
+
     # Always offer the startup task picker on an interactive terminal:
     # list the dataset's tasks (the canonical / --task one shown as the
     # default) so the operator can pick another or type a custom task.
@@ -1518,6 +1545,8 @@ def main(argv: list[str] | None = None) -> int:
     runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0)
     runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0)
     runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0)
+    # Apply the startup mode chosen above the task picker.
+    runtime.state["mode"] = startup_mode
     if args.task:
         runtime.set_task(args.task)
     # Seed the current subtask from the dataset so the first chunk —
@@ -1600,17 +1629,17 @@ def _run_repl(runtime: Any, *, initial_task: str | None, max_ticks: int | None)
             if lower in {"stop", "quit", "exit"}:
                 break
 
-            # Slash commands (/action, /vlm, /help) flip the run mode.
+            # Slash commands (/action, /question, /help) flip the run mode.
             if _handle_slash_command(runtime, line):
                 _redraw(last_logs)
                 continue
 
-            # ``/vlm`` mode: a typed line (that isn't a task command) is
-            # a VQA question — run it synchronously and skip the action
-            # pipeline tick entirely.
+            # ``/question`` mode: a typed line (that isn't a task
+            # command) is a VQA question — run it synchronously and skip
+            # the action pipeline tick entirely.
             if (
                 runtime.state.get("task")
-                and runtime.state.get("mode", "action") == "vlm"
+                and runtime.state.get("mode", "action") == "question"
                 and not lower.startswith(("task:", "rephrase:"))
             ):
                 runtime.state["log_lines"] = []