From 15229468d0dca89f863c475bf5a41d10b983c374 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Mon, 18 May 2026 14:17:03 +0200 Subject: [PATCH] feat(smolvla2): startup mode prompt; rename /vlm mode to /question Add a mode prompt at startup, shown before the task picker, so the operator chooses action (run the robot) vs question (VQA only) up front instead of having to discover /vlm mid-run. Also rename the VQA mode from "vlm" to the clearer "question": - state["mode"] value is now "action" | "question" - the command is /question (/q added as a short alias; /vlm and /vqa kept as aliases) - panels, hints and help text updated to match. handle_vqa_query now reports via both push_log and direct stdout, so VQA answers / overlay paths are visible in autonomous question mode where the panel redraw is suspended. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/smolvla2/inference/repl.py | 12 +-- .../smolvla2/inference/runtime_state.py | 4 +- src/lerobot/policies/smolvla2/inference/ui.py | 2 +- .../policies/smolvla2/inference/vqa.py | 36 +++++--- .../scripts/lerobot_smolvla2_runtime.py | 83 +++++++++++++------ 5 files changed, 88 insertions(+), 49 deletions(-) diff --git a/src/lerobot/policies/smolvla2/inference/repl.py b/src/lerobot/policies/smolvla2/inference/repl.py index 7ab84dbf2..11a4814ca 100644 --- a/src/lerobot/policies/smolvla2/inference/repl.py +++ b/src/lerobot/policies/smolvla2/inference/repl.py @@ -16,7 +16,7 @@ Reads non-blocking stdin lines, classifies each one heuristically: "stop" / "quit" / "exit" → state["stop"] = True - "/action" / "/vlm" → set state["mode"] + "/action" / "/question" → set state["mode"] ends with "?" → user_vqa_query event starts with "task:" or first line → set runtime task anything else → user_interjection event @@ -75,14 +75,14 @@ class StdinReader: state["stop"] = True return - # Slash commands flip the run mode. ``/vlm`` pauses the action - # loop (the action steps gate on ``state["mode"]``); ``/action`` - # resumes it. 
+ # Slash commands flip the run mode. ``/question`` pauses the + # action loop (the action steps gate on ``state["mode"]``); + # ``/action`` resumes it. ``/q`` is a short alias; ``/vlm`` / ``/vqa`` are kept as aliases. if lower in {"/action", "/act"}: state["mode"] = "action" return - if lower in {"/vlm", "/vqa"}: - state["mode"] = "vlm" + if lower in {"/question", "/q", "/vlm", "/vqa"}: + state["mode"] = "question" queue = state.get("action_queue") if hasattr(queue, "clear"): queue.clear() diff --git a/src/lerobot/policies/smolvla2/inference/runtime_state.py b/src/lerobot/policies/smolvla2/inference/runtime_state.py index 49f2f8874..1a808846c 100644 --- a/src/lerobot/policies/smolvla2/inference/runtime_state.py +++ b/src/lerobot/policies/smolvla2/inference/runtime_state.py @@ -33,8 +33,8 @@ Stable keys (read by multiple steps): events_this_tick list[str] triggers consumed this tick _tick Tick current tick (set by the loop) - mode str "action" (run the robot) | "vlm" (VQA only, - action loop paused) + mode str "action" (run the robot) | "question" (VQA + only, action loop paused) log_lines list[str] human-readable status lines printed each tick """ diff --git a/src/lerobot/policies/smolvla2/inference/ui.py b/src/lerobot/policies/smolvla2/inference/ui.py index 567610bce..aa70eb9a8 100644 --- a/src/lerobot/policies/smolvla2/inference/ui.py +++ b/src/lerobot/policies/smolvla2/inference/ui.py @@ -93,7 +93,7 @@ def make_state_panel(state: dict[str, Any]) -> Any: table.add_row("", footer) run_mode = state.get("mode", "action") mode_tag = ( - "[green]action[/]" if run_mode == "action" else "[yellow]vlm (paused)[/]" + "[green]action[/]" if run_mode == "action" else "[yellow]question (paused)[/]" ) return Panel( table, diff --git a/src/lerobot/policies/smolvla2/inference/vqa.py b/src/lerobot/policies/smolvla2/inference/vqa.py index 3263b33bd..bdf345214 100644 --- a/src/lerobot/policies/smolvla2/inference/vqa.py +++ b/src/lerobot/policies/smolvla2/inference/vqa.py @@ -272,14 +272,24 @@ def 
handle_vqa_query( """Run one interactive VQA question end to end. Called synchronously from the input layer while the runtime is in - ``/vlm`` mode (the action loop is gated off, so the policy is not in - concurrent use). All progress is reported via :func:`push_log` so it - shows up in the state panel's scrollback. + ``/question`` mode (the action loop is gated off, so the policy is + not in concurrent use). Progress is reported via both + :func:`push_log` (REPL panel scrollback) and ``print_fn`` (direct + stdout) — in autonomous question mode the panel redraw is suspended, + so the direct print is what the operator actually sees. """ from .steps import _generate_with_policy, _msgs_for_vqa # noqa: PLC0415 + def report(line: str) -> None: + """Surface a line both to the panel scrollback and to stdout.""" + push_log(state, line) + try: + print_fn(line) + except Exception: # noqa: BLE001 + pass + if policy is None or not hasattr(policy, "select_message"): - push_log(state, " [warn] vqa: policy has no select_message — skipping") + report(" [warn] vqa: policy has no select_message — skipping") return observation: dict | None = None @@ -294,11 +304,11 @@ def handle_vqa_query( if cameras: chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn) if chosen is None: - push_log(state, " [info] vqa cancelled — no camera selected") + report(" [info] vqa cancelled — no camera selected") return - push_log(state, f" vqa camera: {camera_short_name(chosen)}") + report(f" vqa camera: {camera_short_name(chosen)}") else: - push_log(state, " [info] vqa: no camera available — answering text-only") + report(" [info] vqa: no camera available — answering text-only") # Ground the question on the chosen camera only — filter the # observation to that one image (+ proprio state) so the VLM @@ -317,23 +327,23 @@ def handle_vqa_query( label="vqa gen", ) if not answer: - push_log(state, " [info] vqa gen returned empty") + report(" [info] vqa gen returned empty") return - 
push_log(state, f" vqa: {answer}") + report(f" vqa: {answer}") parsed = parse_vqa_answer(answer) if not answer_has_overlay(parsed): if parsed is None: - push_log(state, " [info] vqa answer is not JSON — no overlay") + report(" [info] vqa answer is not JSON — no overlay") return if observation is None or chosen is None: - push_log(state, " [info] no camera image — cannot draw overlay") + report(" [info] no camera image — cannot draw overlay") return try: pil = observation_image_to_pil(observation[chosen]) overlay = draw_vqa_overlay(pil, parsed) path = save_and_open_overlay(overlay) - push_log(state, f" vqa overlay saved: {path}") + report(f" vqa overlay saved: {path}") except Exception as exc: # noqa: BLE001 logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG)) - push_log(state, f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}") + report(f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}") diff --git a/src/lerobot/scripts/lerobot_smolvla2_runtime.py b/src/lerobot/scripts/lerobot_smolvla2_runtime.py index 7c4153961..bb165a188 100644 --- a/src/lerobot/scripts/lerobot_smolvla2_runtime.py +++ b/src/lerobot/scripts/lerobot_smolvla2_runtime.py @@ -965,12 +965,33 @@ def _select_task_interactively(ds_meta: Any, current_task: str | None) -> str | return raw +def _select_mode_interactively() -> str: + """Ask which mode to start in: ``action`` (run the robot) or + ``question`` (VQA only, robot paused). + + Shown at startup, before the task picker. Non-TTY / scripted runs + default to ``action`` so existing pipelines are unaffected. 
+ """ + if not sys.stdin.isatty(): + return "action" + print("[smolvla2] Start in which mode?", flush=True) + print(" [1] action — run the robot autonomously (default)", flush=True) + print(" [2] question — ask the VLM questions (VQA); robot stays paused", flush=True) + try: + raw = input("mode> (Enter = action) ").strip().lower() + except (EOFError, KeyboardInterrupt): + return "action" + if raw in {"2", "question", "q", "/question", "/q", "vlm", "vqa", "/vlm", "/vqa"}: + return "question" + return "action" + + def _print_runtime_help() -> None: """Print the slash-command reference.""" print( "[smolvla2] commands:\n" " /action run the robot (default mode)\n" - " /vlm pause the action loop; typed lines become VQA questions\n" + " /question pause the action loop; typed lines become VQA questions\n" " /help show this help\n" " task: switch task (clears plan / memory / subtask)\n" " rephrase: reword the task in place\n" @@ -980,24 +1001,25 @@ def _print_runtime_help() -> None: def _handle_slash_command(runtime: Any, line: str) -> bool: - """Handle ``/action`` / ``/vlm`` / ``/help``. + """Handle ``/action`` / ``/question`` / ``/help``. - Returns ``True`` when ``line`` was a recognised command (and was - consumed), ``False`` otherwise. + ``/vlm`` and ``/vqa`` are kept as aliases for ``/question``. Returns + ``True`` when ``line`` was a recognised command (and was consumed), + ``False`` otherwise. """ cmd = line.strip().lower() if cmd in {"/action", "/act"}: runtime.state["mode"] = "action" print("[smolvla2] mode: action — robot running", flush=True) return True - if cmd in {"/vlm", "/vqa"}: - runtime.state["mode"] = "vlm" + if cmd in {"/question", "/q", "/vlm", "/vqa"}: + runtime.state["mode"] = "question" # Drop any queued chunk so no stale action fires while paused. 
queue = runtime.state.get("action_queue") if hasattr(queue, "clear"): queue.clear() print( - "[smolvla2] mode: vlm — action loop paused; type VQA questions", + "[smolvla2] mode: question — action loop paused; type VQA questions", flush=True, ) return True @@ -1010,8 +1032,8 @@ def _handle_slash_command(runtime: Any, line: str) -> bool: def _run_vqa_query(runtime: Any, question: str) -> None: """Run one interactive VQA question against the runtime's policy. - Used by both loops when in ``/vlm`` mode — the action loop is paused - so the policy is free for a synchronous VQA call. + Used by both loops when in ``/question`` mode — the action loop is + paused so the policy is free for a synchronous VQA call. """ from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query # noqa: PLC0415 @@ -1089,7 +1111,7 @@ def _run_autonomous( redraw() print( " [autonomous] type interjections / '?' questions on stdin; " - "/vlm for VQA mode, /action to resume, /help for commands, " + "/question for VQA mode, /action to resume, /help for commands, " "'stop' or Ctrl+C to quit", flush=True, ) @@ -1133,18 +1155,18 @@ def _run_autonomous( lower = line.lower() if lower in {"stop", "quit", "exit"}: break - # Slash commands (/action, /vlm, /help) flip the run mode. + # Slash commands (/action, /question, /help) flip the run mode. if _handle_slash_command(runtime, line): # Redraw once so the panel reflects the new mode. In - # ``/vlm`` the timer redraw is now suspended, so this is - # the last clear — the VQA prompt below stays stable. + # ``/question`` the timer redraw is now suspended, so + # this is the last clear — the VQA prompt stays stable. 
try: redraw() except Exception: # noqa: BLE001 pass - if runtime.state.get("mode") == "vlm": + if runtime.state.get("mode") == "question": print( - " [vlm] type a VQA question and press Enter; " + " [question] type a VQA question and press Enter; " "/action to resume the robot.", flush=True, ) @@ -1187,10 +1209,10 @@ def _run_autonomous( if not runtime.state.get("task"): runtime.set_task(line) continue - # ``/vlm`` mode: the whole line is a VQA question, handled - # synchronously (the action loop is paused so the policy is - # not in concurrent use by the background runtime thread). - if runtime.state.get("mode", "action") == "vlm": + # ``/question`` mode: the whole line is a VQA question, + # handled synchronously (the action loop is paused so the + # policy is not in concurrent use by the background thread). + if runtime.state.get("mode", "action") == "question": _run_vqa_query(runtime, line) continue if lower.endswith("?"): @@ -1242,7 +1264,7 @@ def _make_state_panel_renderer( mode_tag = ( "[green]mode: action[/]" if run_mode == "action" - else "[yellow]mode: vlm (action loop paused)[/]" + else "[yellow]mode: question (action loop paused)[/]" ) console.rule( f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan" @@ -1252,7 +1274,7 @@ def _make_state_panel_renderer( # away under the timer redraw). if run_mode == "action": console.print( - " [dim]commands:[/] [bold]/vlm[/] ask a VQA question · " + " [dim]commands:[/] [bold]/question[/] ask a VQA question · " "[bold]/help[/] all commands · [bold]stop[/] quit" ) else: @@ -1335,7 +1357,7 @@ def _make_state_panel_renderer( console.print() if not st.get("task"): console.print( - " [dim]Type the task to begin. /vlm switches to VQA mode, " + " [dim]Type the task to begin. /question switches to VQA mode, " "/action resumes the robot, /help lists commands. 
" "Type 'stop' to exit.[/]" ) @@ -1438,6 +1460,11 @@ def main(argv: list[str] | None = None) -> int: flush=True, ) + # Startup mode prompt — choose action (run the robot) vs question + # (VQA only) *before* the task picker, so the operator sets intent + # up front. It can still be flipped any time with /action /question. + startup_mode = _select_mode_interactively() + # Always offer the startup task picker on an interactive terminal: # list the dataset's tasks (the canonical / --task one shown as the # default) so the operator can pick another or type a custom task. @@ -1518,6 +1545,8 @@ def main(argv: list[str] | None = None) -> int: runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0) runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0) runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0) + # Apply the startup mode chosen above the task picker. + runtime.state["mode"] = startup_mode if args.task: runtime.set_task(args.task) # Seed the current subtask from the dataset so the first chunk — @@ -1600,17 +1629,17 @@ def _run_repl(runtime: Any, *, initial_task: str | None, max_ticks: int | None) if lower in {"stop", "quit", "exit"}: break - # Slash commands (/action, /vlm, /help) flip the run mode. + # Slash commands (/action, /question, /help) flip the run mode. if _handle_slash_command(runtime, line): _redraw(last_logs) continue - # ``/vlm`` mode: a typed line (that isn't a task command) is - # a VQA question — run it synchronously and skip the action - # pipeline tick entirely. + # ``/question`` mode: a typed line (that isn't a task + # command) is a VQA question — run it synchronously and skip + # the action pipeline tick entirely. if ( runtime.state.get("task") - and runtime.state.get("mode", "action") == "vlm" + and runtime.state.get("mode", "action") == "question" and not lower.startswith(("task:", "rephrase:")) ): runtime.state["log_lines"] = []