feat(smolvla2): startup mode prompt; rename /vlm mode to /question

Add a mode prompt at startup, shown before the task picker, so the operator chooses action (run the robot) vs question (VQA only) up front instead of having to discover /vlm mid-run.

Also rename the VQA mode from "vlm" to the clearer "question":
- state["mode"] value is now "action" | "question"
- the command is /question (/vlm and /vqa kept as aliases)
- panels, hints and help text updated to match

handle_vqa_query now reports via both push_log and direct stdout, so VQA answers / overlay paths are visible in autonomous question mode where the panel redraw is suspended.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 10:40:04 +00:00 · 2026-05-18 14:17:03 +02:00
parent a9cea3e8dd
commit 15229468d0
5 changed files with 88 additions and 49 deletions
@@ -16,7 +16,7 @@
 Reads non-blocking stdin lines, classifies each one heuristically:

  "stop" / "quit" / "exit"               → state["stop"] = True
-  "/action" / "/vlm"                      → set state["mode"]
+  "/action" / "/question"                 → set state["mode"]
  ends with "?"                           → user_vqa_query event
  starts with "task:" or first line       → set runtime task
  anything else                           → user_interjection event
@@ -75,14 +75,14 @@ class StdinReader:
            state["stop"] = True
            return

-        # Slash commands flip the run mode. ``/vlm`` pauses the action
-        # loop (the action steps gate on ``state["mode"]``); ``/action``
-        # resumes it.
+        # Slash commands flip the run mode. ``/question`` pauses the
+        # action loop (the action steps gate on ``state["mode"]``);
+        # ``/action`` resumes it. ``/vlm`` / ``/vqa`` are kept as aliases.
        if lower in {"/action", "/act"}:
            state["mode"] = "action"
            return
-        if lower in {"/vlm", "/vqa"}:
-            state["mode"] = "vlm"
+        if lower in {"/question", "/q", "/vlm", "/vqa"}:
+            state["mode"] = "question"
            queue = state.get("action_queue")
            if hasattr(queue, "clear"):
                queue.clear()
@@ -33,8 +33,8 @@ Stable keys (read by multiple steps):
  events_this_tick list[str]    triggers consumed this tick
  _tick         Tick            current tick (set by the loop)

-  mode          str             "action" (run the robot) | "vlm" (VQA only,
-                                 action loop paused)
+  mode          str             "action" (run the robot) | "question" (VQA
+                                 only, action loop paused)

  log_lines     list[str]       human-readable status lines printed each tick
 """
@@ -93,7 +93,7 @@ def make_state_panel(state: dict[str, Any]) -> Any:
    table.add_row("", footer)
    run_mode = state.get("mode", "action")
    mode_tag = (
-        "[green]action[/]" if run_mode == "action" else "[yellow]vlm (paused)[/]"
+        "[green]action[/]" if run_mode == "action" else "[yellow]question (paused)[/]"
    )
    return Panel(
        table,
@@ -272,14 +272,24 @@ def handle_vqa_query(
    """Run one interactive VQA question end to end.

    Called synchronously from the input layer while the runtime is in
-    ``/vlm`` mode (the action loop is gated off, so the policy is not in
-    concurrent use). All progress is reported via :func:`push_log` so it
-    shows up in the state panel's scrollback.
+    ``/question`` mode (the action loop is gated off, so the policy is
+    not in concurrent use). Progress is reported via both
+    :func:`push_log` (REPL panel scrollback) and ``print_fn`` (direct
+    stdout) — in autonomous question mode the panel redraw is suspended,
+    so the direct print is what the operator actually sees.
    """
    from .steps import _generate_with_policy, _msgs_for_vqa  # noqa: PLC0415

+    def report(line: str) -> None:
+        """Surface a line both to the panel scrollback and to stdout."""
+        push_log(state, line)
+        try:
+            print_fn(line)
+        except Exception:  # noqa: BLE001
+            pass
+
    if policy is None or not hasattr(policy, "select_message"):
-        push_log(state, "  [warn] vqa: policy has no select_message — skipping")
+        report("  [warn] vqa: policy has no select_message — skipping")
        return

    observation: dict | None = None
@@ -294,11 +304,11 @@ def handle_vqa_query(
    if cameras:
        chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
        if chosen is None:
-            push_log(state, "  [info] vqa cancelled — no camera selected")
+            report("  [info] vqa cancelled — no camera selected")
            return
-        push_log(state, f"  vqa camera: {camera_short_name(chosen)}")
+        report(f"  vqa camera: {camera_short_name(chosen)}")
    else:
-        push_log(state, "  [info] vqa: no camera available — answering text-only")
+        report("  [info] vqa: no camera available — answering text-only")

    # Ground the question on the chosen camera only — filter the
    # observation to that one image (+ proprio state) so the VLM
@@ -317,23 +327,23 @@ def handle_vqa_query(
        label="vqa gen",
    )
    if not answer:
-        push_log(state, "  [info] vqa gen returned empty")
+        report("  [info] vqa gen returned empty")
        return
-    push_log(state, f"  vqa: {answer}")
+    report(f"  vqa: {answer}")

    parsed = parse_vqa_answer(answer)
    if not answer_has_overlay(parsed):
        if parsed is None:
-            push_log(state, "  [info] vqa answer is not JSON — no overlay")
+            report("  [info] vqa answer is not JSON — no overlay")
        return
    if observation is None or chosen is None:
-        push_log(state, "  [info] no camera image — cannot draw overlay")
+        report("  [info] no camera image — cannot draw overlay")
        return
    try:
        pil = observation_image_to_pil(observation[chosen])
        overlay = draw_vqa_overlay(pil, parsed)
        path = save_and_open_overlay(overlay)
-        push_log(state, f"  vqa overlay saved: {path}")
+        report(f"  vqa overlay saved: {path}")
    except Exception as exc:  # noqa: BLE001
        logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
-        push_log(state, f"  [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
+        report(f"  [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
@@ -965,12 +965,33 @@ def _select_task_interactively(ds_meta: Any, current_task: str | None) -> str |
    return raw


+def _select_mode_interactively() -> str:
+    """Ask which mode to start in: ``action`` (run the robot) or
+    ``question`` (VQA only, robot paused).
+
+    Shown at startup, before the task picker. Non-TTY / scripted runs
+    default to ``action`` so existing pipelines are unaffected.
+    """
+    if not sys.stdin.isatty():
+        return "action"
+    print("[smolvla2] Start in which mode?", flush=True)
+    print("  [1] action   — run the robot autonomously (default)", flush=True)
+    print("  [2] question — ask the VLM questions (VQA); robot stays paused", flush=True)
+    try:
+        raw = input("mode> (Enter = action) ").strip().lower()
+    except (EOFError, KeyboardInterrupt):
+        return "action"
+    if raw in {"2", "question", "q", "/question", "/q", "vlm", "vqa", "/vlm", "/vqa"}:
+        return "question"
+    return "action"
+
+
 def _print_runtime_help() -> None:
    """Print the slash-command reference."""
    print(
        "[smolvla2] commands:\n"
        "  /action            run the robot (default mode)\n"
-        "  /vlm               pause the action loop; typed lines become VQA questions\n"
+        "  /question          pause the action loop; typed lines become VQA questions\n"
        "  /help              show this help\n"
        "  task: <text>       switch task (clears plan / memory / subtask)\n"
        "  rephrase: <text>   reword the task in place\n"
@@ -980,24 +1001,25 @@ def _print_runtime_help() -> None:


 def _handle_slash_command(runtime: Any, line: str) -> bool:
-    """Handle ``/action`` / ``/vlm`` / ``/help``.
+    """Handle ``/action`` / ``/question`` / ``/help``.

-    Returns ``True`` when ``line`` was a recognised command (and was
-    consumed), ``False`` otherwise.
+    ``/vlm`` and ``/vqa`` are kept as aliases for ``/question``. Returns
+    ``True`` when ``line`` was a recognised command (and was consumed),
+    ``False`` otherwise.
    """
    cmd = line.strip().lower()
    if cmd in {"/action", "/act"}:
        runtime.state["mode"] = "action"
        print("[smolvla2] mode: action — robot running", flush=True)
        return True
-    if cmd in {"/vlm", "/vqa"}:
-        runtime.state["mode"] = "vlm"
+    if cmd in {"/question", "/q", "/vlm", "/vqa"}:
+        runtime.state["mode"] = "question"
        # Drop any queued chunk so no stale action fires while paused.
        queue = runtime.state.get("action_queue")
        if hasattr(queue, "clear"):
            queue.clear()
        print(
-            "[smolvla2] mode: vlm — action loop paused; type VQA questions",
+            "[smolvla2] mode: question — action loop paused; type VQA questions",
            flush=True,
        )
        return True
@@ -1010,8 +1032,8 @@ def _handle_slash_command(runtime: Any, line: str) -> bool:
 def _run_vqa_query(runtime: Any, question: str) -> None:
    """Run one interactive VQA question against the runtime's policy.

-    Used by both loops when in ``/vlm`` mode — the action loop is paused
-    so the policy is free for a synchronous VQA call.
+    Used by both loops when in ``/question`` mode — the action loop is
+    paused so the policy is free for a synchronous VQA call.
    """
    from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query  # noqa: PLC0415

@@ -1089,7 +1111,7 @@ def _run_autonomous(
    redraw()
    print(
        "  [autonomous] type interjections / '?' questions on stdin; "
-        "/vlm for VQA mode, /action to resume, /help for commands, "
+        "/question for VQA mode, /action to resume, /help for commands, "
        "'stop' or Ctrl+C to quit",
        flush=True,
    )
@@ -1133,18 +1155,18 @@ def _run_autonomous(
            lower = line.lower()
            if lower in {"stop", "quit", "exit"}:
                break
-            # Slash commands (/action, /vlm, /help) flip the run mode.
+            # Slash commands (/action, /question, /help) flip the run mode.
            if _handle_slash_command(runtime, line):
                # Redraw once so the panel reflects the new mode. In
-                # ``/vlm`` the timer redraw is now suspended, so this is
-                # the last clear — the VQA prompt below stays stable.
+                # ``/question`` the timer redraw is now suspended, so
+                # this is the last clear — the VQA prompt stays stable.
                try:
                    redraw()
                except Exception:  # noqa: BLE001
                    pass
-                if runtime.state.get("mode") == "vlm":
+                if runtime.state.get("mode") == "question":
                    print(
-                        "  [vlm] type a VQA question and press Enter; "
+                        "  [question] type a VQA question and press Enter; "
                        "/action to resume the robot.",
                        flush=True,
                    )
@@ -1187,10 +1209,10 @@ def _run_autonomous(
            if not runtime.state.get("task"):
                runtime.set_task(line)
                continue
-            # ``/vlm`` mode: the whole line is a VQA question, handled
-            # synchronously (the action loop is paused so the policy is
-            # not in concurrent use by the background runtime thread).
-            if runtime.state.get("mode", "action") == "vlm":
+            # ``/question`` mode: the whole line is a VQA question,
+            # handled synchronously (the action loop is paused so the
+            # policy is not in concurrent use by the background thread).
+            if runtime.state.get("mode", "action") == "question":
                _run_vqa_query(runtime, line)
                continue
            if lower.endswith("?"):
@@ -1242,7 +1264,7 @@ def _make_state_panel_renderer(
        mode_tag = (
            "[green]mode: action[/]"
            if run_mode == "action"
-            else "[yellow]mode: vlm (action loop paused)[/]"
+            else "[yellow]mode: question (action loop paused)[/]"
        )
        console.rule(
            f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan"
@@ -1252,7 +1274,7 @@ def _make_state_panel_renderer(
        # away under the timer redraw).
        if run_mode == "action":
            console.print(
-                "  [dim]commands:[/] [bold]/vlm[/] ask a VQA question  ·  "
+                "  [dim]commands:[/] [bold]/question[/] ask a VQA question  ·  "
                "[bold]/help[/] all commands  ·  [bold]stop[/] quit"
            )
        else:
@@ -1335,7 +1357,7 @@ def _make_state_panel_renderer(
            console.print()
        if not st.get("task"):
            console.print(
-                "  [dim]Type the task to begin. /vlm switches to VQA mode, "
+                "  [dim]Type the task to begin. /question switches to VQA mode, "
                "/action resumes the robot, /help lists commands. "
                "Type 'stop' to exit.[/]"
            )
@@ -1438,6 +1460,11 @@ def main(argv: list[str] | None = None) -> int:
                flush=True,
            )

+    # Startup mode prompt — choose action (run the robot) vs question
+    # (VQA only) *before* the task picker, so the operator sets intent
+    # up front. It can still be flipped any time with /action /question.
+    startup_mode = _select_mode_interactively()
+
    # Always offer the startup task picker on an interactive terminal:
    # list the dataset's tasks (the canonical / --task one shown as the
    # default) so the operator can pick another or type a custom task.
@@ -1518,6 +1545,8 @@ def main(argv: list[str] | None = None) -> int:
    runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0)
    runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0)
    runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0)
+    # Apply the startup mode chosen above the task picker.
+    runtime.state["mode"] = startup_mode
    if args.task:
        runtime.set_task(args.task)
    # Seed the current subtask from the dataset so the first chunk —
@@ -1600,17 +1629,17 @@ def _run_repl(runtime: Any, *, initial_task: str | None, max_ticks: int | None)
            if lower in {"stop", "quit", "exit"}:
                break

-            # Slash commands (/action, /vlm, /help) flip the run mode.
+            # Slash commands (/action, /question, /help) flip the run mode.
            if _handle_slash_command(runtime, line):
                _redraw(last_logs)
                continue

-            # ``/vlm`` mode: a typed line (that isn't a task command) is
-            # a VQA question — run it synchronously and skip the action
-            # pipeline tick entirely.
+            # ``/question`` mode: a typed line (that isn't a task
+            # command) is a VQA question — run it synchronously and skip
+            # the action pipeline tick entirely.
            if (
                runtime.state.get("task")
-                and runtime.state.get("mode", "action") == "vlm"
+                and runtime.state.get("mode", "action") == "question"
                and not lower.startswith(("task:", "rephrase:"))
            ):
                runtime.state["log_lines"] = []