feat(smolvla2): startup mode prompt; rename /vlm mode to /question

Add a mode prompt at startup, shown before the task picker, so the
operator chooses action (run the robot) vs question (VQA only) up front
instead of having to discover /vlm mid-run.

Also rename the VQA mode from "vlm" to the clearer "question":
- state["mode"] value is now "action" | "question"
- the command is /question (short alias /q; /vlm and /vqa kept as aliases)
- panels, hints and help text updated to match

handle_vqa_query now reports via both push_log and direct stdout, so
VQA answers / overlay paths are visible in autonomous question mode
where the panel redraw is suspended.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-18 14:17:03 +02:00
parent a9cea3e8dd
commit 15229468d0
5 changed files with 88 additions and 49 deletions
@@ -16,7 +16,7 @@
Reads non-blocking stdin lines, classifies each one heuristically: Reads non-blocking stdin lines, classifies each one heuristically:
"stop" / "quit" / "exit" → state["stop"] = True "stop" / "quit" / "exit" → state["stop"] = True
"/action" / "/vlm" → set state["mode"] "/action" / "/question" → set state["mode"]
ends with "?" → user_vqa_query event ends with "?" → user_vqa_query event
starts with "task:" or first line → set runtime task starts with "task:" or first line → set runtime task
anything else → user_interjection event anything else → user_interjection event
@@ -75,14 +75,14 @@ class StdinReader:
state["stop"] = True state["stop"] = True
return return
# Slash commands flip the run mode. ``/vlm`` pauses the action # Slash commands flip the run mode. ``/question`` pauses the
# loop (the action steps gate on ``state["mode"]``); ``/action`` # action loop (the action steps gate on ``state["mode"]``);
# resumes it. # ``/action`` resumes it. ``/vlm`` / ``/vqa`` are kept as aliases.
if lower in {"/action", "/act"}: if lower in {"/action", "/act"}:
state["mode"] = "action" state["mode"] = "action"
return return
if lower in {"/vlm", "/vqa"}: if lower in {"/question", "/q", "/vlm", "/vqa"}:
state["mode"] = "vlm" state["mode"] = "question"
queue = state.get("action_queue") queue = state.get("action_queue")
if hasattr(queue, "clear"): if hasattr(queue, "clear"):
queue.clear() queue.clear()
@@ -33,8 +33,8 @@ Stable keys (read by multiple steps):
events_this_tick list[str] triggers consumed this tick events_this_tick list[str] triggers consumed this tick
_tick Tick current tick (set by the loop) _tick Tick current tick (set by the loop)
mode str "action" (run the robot) | "vlm" (VQA only, mode str "action" (run the robot) | "question" (VQA
action loop paused) only, action loop paused)
log_lines list[str] human-readable status lines printed each tick log_lines list[str] human-readable status lines printed each tick
""" """
@@ -93,7 +93,7 @@ def make_state_panel(state: dict[str, Any]) -> Any:
table.add_row("", footer) table.add_row("", footer)
run_mode = state.get("mode", "action") run_mode = state.get("mode", "action")
mode_tag = ( mode_tag = (
"[green]action[/]" if run_mode == "action" else "[yellow]vlm (paused)[/]" "[green]action[/]" if run_mode == "action" else "[yellow]question (paused)[/]"
) )
return Panel( return Panel(
table, table,
+23 -13
View File
@@ -272,14 +272,24 @@ def handle_vqa_query(
"""Run one interactive VQA question end to end. """Run one interactive VQA question end to end.
Called synchronously from the input layer while the runtime is in Called synchronously from the input layer while the runtime is in
``/vlm`` mode (the action loop is gated off, so the policy is not in ``/question`` mode (the action loop is gated off, so the policy is
concurrent use). All progress is reported via :func:`push_log` so it not in concurrent use). Progress is reported via both
shows up in the state panel's scrollback. :func:`push_log` (REPL panel scrollback) and ``print_fn`` (direct
stdout) — in autonomous question mode the panel redraw is suspended,
so the direct print is what the operator actually sees.
""" """
from .steps import _generate_with_policy, _msgs_for_vqa # noqa: PLC0415 from .steps import _generate_with_policy, _msgs_for_vqa # noqa: PLC0415
def report(line: str) -> None:
"""Surface a line both to the panel scrollback and to stdout."""
push_log(state, line)
try:
print_fn(line)
except Exception: # noqa: BLE001
pass
if policy is None or not hasattr(policy, "select_message"): if policy is None or not hasattr(policy, "select_message"):
push_log(state, " [warn] vqa: policy has no select_message — skipping") report(" [warn] vqa: policy has no select_message — skipping")
return return
observation: dict | None = None observation: dict | None = None
@@ -294,11 +304,11 @@ def handle_vqa_query(
if cameras: if cameras:
chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn) chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
if chosen is None: if chosen is None:
push_log(state, " [info] vqa cancelled — no camera selected") report(" [info] vqa cancelled — no camera selected")
return return
push_log(state, f" vqa camera: {camera_short_name(chosen)}") report(f" vqa camera: {camera_short_name(chosen)}")
else: else:
push_log(state, " [info] vqa: no camera available — answering text-only") report(" [info] vqa: no camera available — answering text-only")
# Ground the question on the chosen camera only — filter the # Ground the question on the chosen camera only — filter the
# observation to that one image (+ proprio state) so the VLM # observation to that one image (+ proprio state) so the VLM
@@ -317,23 +327,23 @@ def handle_vqa_query(
label="vqa gen", label="vqa gen",
) )
if not answer: if not answer:
push_log(state, " [info] vqa gen returned empty") report(" [info] vqa gen returned empty")
return return
push_log(state, f" vqa: {answer}") report(f" vqa: {answer}")
parsed = parse_vqa_answer(answer) parsed = parse_vqa_answer(answer)
if not answer_has_overlay(parsed): if not answer_has_overlay(parsed):
if parsed is None: if parsed is None:
push_log(state, " [info] vqa answer is not JSON — no overlay") report(" [info] vqa answer is not JSON — no overlay")
return return
if observation is None or chosen is None: if observation is None or chosen is None:
push_log(state, " [info] no camera image — cannot draw overlay") report(" [info] no camera image — cannot draw overlay")
return return
try: try:
pil = observation_image_to_pil(observation[chosen]) pil = observation_image_to_pil(observation[chosen])
overlay = draw_vqa_overlay(pil, parsed) overlay = draw_vqa_overlay(pil, parsed)
path = save_and_open_overlay(overlay) path = save_and_open_overlay(overlay)
push_log(state, f" vqa overlay saved: {path}") report(f" vqa overlay saved: {path}")
except Exception as exc: # noqa: BLE001 except Exception as exc: # noqa: BLE001
logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG)) logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
push_log(state, f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}") report(f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
+56 -27
View File
@@ -965,12 +965,33 @@ def _select_task_interactively(ds_meta: Any, current_task: str | None) -> str |
return raw return raw
def _select_mode_interactively() -> str:
"""Ask which mode to start in: ``action`` (run the robot) or
``question`` (VQA only, robot paused).
Shown at startup, before the task picker. Non-TTY / scripted runs
default to ``action`` so existing pipelines are unaffected.
"""
if not sys.stdin.isatty():
return "action"
print("[smolvla2] Start in which mode?", flush=True)
print(" [1] action — run the robot autonomously (default)", flush=True)
print(" [2] question — ask the VLM questions (VQA); robot stays paused", flush=True)
try:
raw = input("mode> (Enter = action) ").strip().lower()
except (EOFError, KeyboardInterrupt):
return "action"
if raw in {"2", "question", "q", "/question", "/q", "vlm", "vqa", "/vlm", "/vqa"}:
return "question"
return "action"
def _print_runtime_help() -> None: def _print_runtime_help() -> None:
"""Print the slash-command reference.""" """Print the slash-command reference."""
print( print(
"[smolvla2] commands:\n" "[smolvla2] commands:\n"
" /action run the robot (default mode)\n" " /action run the robot (default mode)\n"
" /vlm pause the action loop; typed lines become VQA questions\n" " /question pause the action loop; typed lines become VQA questions\n"
" /help show this help\n" " /help show this help\n"
" task: <text> switch task (clears plan / memory / subtask)\n" " task: <text> switch task (clears plan / memory / subtask)\n"
" rephrase: <text> reword the task in place\n" " rephrase: <text> reword the task in place\n"
@@ -980,24 +1001,25 @@ def _print_runtime_help() -> None:
def _handle_slash_command(runtime: Any, line: str) -> bool: def _handle_slash_command(runtime: Any, line: str) -> bool:
"""Handle ``/action`` / ``/vlm`` / ``/help``. """Handle ``/action`` / ``/question`` / ``/help``.
Returns ``True`` when ``line`` was a recognised command (and was ``/vlm`` and ``/vqa`` are kept as aliases for ``/question``. Returns
consumed), ``False`` otherwise. ``True`` when ``line`` was a recognised command (and was consumed),
``False`` otherwise.
""" """
cmd = line.strip().lower() cmd = line.strip().lower()
if cmd in {"/action", "/act"}: if cmd in {"/action", "/act"}:
runtime.state["mode"] = "action" runtime.state["mode"] = "action"
print("[smolvla2] mode: action — robot running", flush=True) print("[smolvla2] mode: action — robot running", flush=True)
return True return True
if cmd in {"/vlm", "/vqa"}: if cmd in {"/question", "/q", "/vlm", "/vqa"}:
runtime.state["mode"] = "vlm" runtime.state["mode"] = "question"
# Drop any queued chunk so no stale action fires while paused. # Drop any queued chunk so no stale action fires while paused.
queue = runtime.state.get("action_queue") queue = runtime.state.get("action_queue")
if hasattr(queue, "clear"): if hasattr(queue, "clear"):
queue.clear() queue.clear()
print( print(
"[smolvla2] mode: vlm — action loop paused; type VQA questions", "[smolvla2] mode: question — action loop paused; type VQA questions",
flush=True, flush=True,
) )
return True return True
@@ -1010,8 +1032,8 @@ def _handle_slash_command(runtime: Any, line: str) -> bool:
def _run_vqa_query(runtime: Any, question: str) -> None: def _run_vqa_query(runtime: Any, question: str) -> None:
"""Run one interactive VQA question against the runtime's policy. """Run one interactive VQA question against the runtime's policy.
Used by both loops when in ``/vlm`` mode the action loop is paused Used by both loops when in ``/question`` mode the action loop is
so the policy is free for a synchronous VQA call. paused so the policy is free for a synchronous VQA call.
""" """
from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query # noqa: PLC0415 from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query # noqa: PLC0415
@@ -1089,7 +1111,7 @@ def _run_autonomous(
redraw() redraw()
print( print(
" [autonomous] type interjections / '?' questions on stdin; " " [autonomous] type interjections / '?' questions on stdin; "
"/vlm for VQA mode, /action to resume, /help for commands, " "/question for VQA mode, /action to resume, /help for commands, "
"'stop' or Ctrl+C to quit", "'stop' or Ctrl+C to quit",
flush=True, flush=True,
) )
@@ -1133,18 +1155,18 @@ def _run_autonomous(
lower = line.lower() lower = line.lower()
if lower in {"stop", "quit", "exit"}: if lower in {"stop", "quit", "exit"}:
break break
# Slash commands (/action, /vlm, /help) flip the run mode. # Slash commands (/action, /question, /help) flip the run mode.
if _handle_slash_command(runtime, line): if _handle_slash_command(runtime, line):
# Redraw once so the panel reflects the new mode. In # Redraw once so the panel reflects the new mode. In
# ``/vlm`` the timer redraw is now suspended, so this is # ``/question`` the timer redraw is now suspended, so
# the last clear — the VQA prompt below stays stable. # this is the last clear — the VQA prompt stays stable.
try: try:
redraw() redraw()
except Exception: # noqa: BLE001 except Exception: # noqa: BLE001
pass pass
if runtime.state.get("mode") == "vlm": if runtime.state.get("mode") == "question":
print( print(
" [vlm] type a VQA question and press Enter; " " [question] type a VQA question and press Enter; "
"/action to resume the robot.", "/action to resume the robot.",
flush=True, flush=True,
) )
@@ -1187,10 +1209,10 @@ def _run_autonomous(
if not runtime.state.get("task"): if not runtime.state.get("task"):
runtime.set_task(line) runtime.set_task(line)
continue continue
# ``/vlm`` mode: the whole line is a VQA question, handled # ``/question`` mode: the whole line is a VQA question,
# synchronously (the action loop is paused so the policy is # handled synchronously (the action loop is paused so the
# not in concurrent use by the background runtime thread). # policy is not in concurrent use by the background thread).
if runtime.state.get("mode", "action") == "vlm": if runtime.state.get("mode", "action") == "question":
_run_vqa_query(runtime, line) _run_vqa_query(runtime, line)
continue continue
if lower.endswith("?"): if lower.endswith("?"):
@@ -1242,7 +1264,7 @@ def _make_state_panel_renderer(
mode_tag = ( mode_tag = (
"[green]mode: action[/]" "[green]mode: action[/]"
if run_mode == "action" if run_mode == "action"
else "[yellow]mode: vlm (action loop paused)[/]" else "[yellow]mode: question (action loop paused)[/]"
) )
console.rule( console.rule(
f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan" f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan"
@@ -1252,7 +1274,7 @@ def _make_state_panel_renderer(
# away under the timer redraw). # away under the timer redraw).
if run_mode == "action": if run_mode == "action":
console.print( console.print(
" [dim]commands:[/] [bold]/vlm[/] ask a VQA question · " " [dim]commands:[/] [bold]/question[/] ask a VQA question · "
"[bold]/help[/] all commands · [bold]stop[/] quit" "[bold]/help[/] all commands · [bold]stop[/] quit"
) )
else: else:
@@ -1335,7 +1357,7 @@ def _make_state_panel_renderer(
console.print() console.print()
if not st.get("task"): if not st.get("task"):
console.print( console.print(
" [dim]Type the task to begin. /vlm switches to VQA mode, " " [dim]Type the task to begin. /question switches to VQA mode, "
"/action resumes the robot, /help lists commands. " "/action resumes the robot, /help lists commands. "
"Type 'stop' to exit.[/]" "Type 'stop' to exit.[/]"
) )
@@ -1438,6 +1460,11 @@ def main(argv: list[str] | None = None) -> int:
flush=True, flush=True,
) )
# Startup mode prompt — choose action (run the robot) vs question
# (VQA only) *before* the task picker, so the operator sets intent
# up front. It can still be flipped at any time with /action or /question.
startup_mode = _select_mode_interactively()
# Always offer the startup task picker on an interactive terminal: # Always offer the startup task picker on an interactive terminal:
# list the dataset's tasks (the canonical / --task one shown as the # list the dataset's tasks (the canonical / --task one shown as the
# default) so the operator can pick another or type a custom task. # default) so the operator can pick another or type a custom task.
@@ -1518,6 +1545,8 @@ def main(argv: list[str] | None = None) -> int:
runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0) runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0)
runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0) runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0)
runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0) runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0)
# Apply the startup mode chosen at the prompt shown before the task picker.
runtime.state["mode"] = startup_mode
if args.task: if args.task:
runtime.set_task(args.task) runtime.set_task(args.task)
# Seed the current subtask from the dataset so the first chunk — # Seed the current subtask from the dataset so the first chunk —
@@ -1600,17 +1629,17 @@ def _run_repl(runtime: Any, *, initial_task: str | None, max_ticks: int | None)
if lower in {"stop", "quit", "exit"}: if lower in {"stop", "quit", "exit"}:
break break
# Slash commands (/action, /vlm, /help) flip the run mode. # Slash commands (/action, /question, /help) flip the run mode.
if _handle_slash_command(runtime, line): if _handle_slash_command(runtime, line):
_redraw(last_logs) _redraw(last_logs)
continue continue
# ``/vlm`` mode: a typed line (that isn't a task command) is # ``/question`` mode: a typed line (that isn't a task
# a VQA question — run it synchronously and skip the action # command) is a VQA question — run it synchronously and skip
# pipeline tick entirely. # the action pipeline tick entirely.
if ( if (
runtime.state.get("task") runtime.state.get("task")
and runtime.state.get("mode", "action") == "vlm" and runtime.state.get("mode", "action") == "question"
and not lower.startswith(("task:", "rephrase:")) and not lower.startswith(("task:", "rephrase:"))
): ):
runtime.state["log_lines"] = [] runtime.state["log_lines"] = []