feat(smolvla2): startup mode prompt; rename /vlm mode to /question

Add a mode prompt at startup, shown before the task picker, so the
operator chooses action (run the robot) vs question (VQA only) up front
instead of having to discover /vlm mid-run.

Also rename the VQA mode from "vlm" to the clearer "question":
- state["mode"] value is now "action" | "question"
- the command is /question (/vlm and /vqa kept as aliases)
- panels, hints and help text updated to match

handle_vqa_query now reports via both push_log and direct stdout, so
VQA answers / overlay paths are visible in autonomous question mode
where the panel redraw is suspended.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-18 14:17:03 +02:00
parent a9cea3e8dd
commit 15229468d0
5 changed files with 88 additions and 49 deletions
@@ -16,7 +16,7 @@
Reads non-blocking stdin lines, classifies each one heuristically:
"stop" / "quit" / "exit" → state["stop"] = True
"/action" / "/vlm" → set state["mode"]
"/action" / "/question" → set state["mode"]
ends with "?" → user_vqa_query event
starts with "task:" or first line → set runtime task
anything else → user_interjection event
@@ -75,14 +75,14 @@ class StdinReader:
state["stop"] = True
return
# Slash commands flip the run mode. ``/vlm`` pauses the action
# loop (the action steps gate on ``state["mode"]``); ``/action``
# resumes it.
# Slash commands flip the run mode. ``/question`` pauses the
# action loop (the action steps gate on ``state["mode"]``);
# ``/action`` resumes it. ``/vlm`` / ``/vqa`` are kept as aliases.
if lower in {"/action", "/act"}:
state["mode"] = "action"
return
if lower in {"/vlm", "/vqa"}:
state["mode"] = "vlm"
if lower in {"/question", "/q", "/vlm", "/vqa"}:
state["mode"] = "question"
queue = state.get("action_queue")
if hasattr(queue, "clear"):
queue.clear()
@@ -33,8 +33,8 @@ Stable keys (read by multiple steps):
events_this_tick list[str] triggers consumed this tick
_tick Tick current tick (set by the loop)
mode str "action" (run the robot) | "vlm" (VQA only,
action loop paused)
mode str "action" (run the robot) | "question" (VQA
only, action loop paused)
log_lines list[str] human-readable status lines printed each tick
"""
@@ -93,7 +93,7 @@ def make_state_panel(state: dict[str, Any]) -> Any:
table.add_row("", footer)
run_mode = state.get("mode", "action")
mode_tag = (
"[green]action[/]" if run_mode == "action" else "[yellow]vlm (paused)[/]"
"[green]action[/]" if run_mode == "action" else "[yellow]question (paused)[/]"
)
return Panel(
table,
+23 -13
View File
@@ -272,14 +272,24 @@ def handle_vqa_query(
"""Run one interactive VQA question end to end.
Called synchronously from the input layer while the runtime is in
``/vlm`` mode (the action loop is gated off, so the policy is not in
concurrent use). All progress is reported via :func:`push_log` so it
shows up in the state panel's scrollback.
``/question`` mode (the action loop is gated off, so the policy is
not in concurrent use). Progress is reported via both
:func:`push_log` (REPL panel scrollback) and ``print_fn`` (direct
stdout) in autonomous question mode the panel redraw is suspended,
so the direct print is what the operator actually sees.
"""
from .steps import _generate_with_policy, _msgs_for_vqa # noqa: PLC0415
def report(line: str) -> None:
"""Surface a line both to the panel scrollback and to stdout."""
push_log(state, line)
try:
print_fn(line)
except Exception: # noqa: BLE001
pass
if policy is None or not hasattr(policy, "select_message"):
push_log(state, " [warn] vqa: policy has no select_message — skipping")
report(" [warn] vqa: policy has no select_message — skipping")
return
observation: dict | None = None
@@ -294,11 +304,11 @@ def handle_vqa_query(
if cameras:
chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
if chosen is None:
push_log(state, " [info] vqa cancelled — no camera selected")
report(" [info] vqa cancelled — no camera selected")
return
push_log(state, f" vqa camera: {camera_short_name(chosen)}")
report(f" vqa camera: {camera_short_name(chosen)}")
else:
push_log(state, " [info] vqa: no camera available — answering text-only")
report(" [info] vqa: no camera available — answering text-only")
# Ground the question on the chosen camera only — filter the
# observation to that one image (+ proprio state) so the VLM
@@ -317,23 +327,23 @@ def handle_vqa_query(
label="vqa gen",
)
if not answer:
push_log(state, " [info] vqa gen returned empty")
report(" [info] vqa gen returned empty")
return
push_log(state, f" vqa: {answer}")
report(f" vqa: {answer}")
parsed = parse_vqa_answer(answer)
if not answer_has_overlay(parsed):
if parsed is None:
push_log(state, " [info] vqa answer is not JSON — no overlay")
report(" [info] vqa answer is not JSON — no overlay")
return
if observation is None or chosen is None:
push_log(state, " [info] no camera image — cannot draw overlay")
report(" [info] no camera image — cannot draw overlay")
return
try:
pil = observation_image_to_pil(observation[chosen])
overlay = draw_vqa_overlay(pil, parsed)
path = save_and_open_overlay(overlay)
push_log(state, f" vqa overlay saved: {path}")
report(f" vqa overlay saved: {path}")
except Exception as exc: # noqa: BLE001
logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
push_log(state, f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
report(f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
+56 -27
View File
@@ -965,12 +965,33 @@ def _select_task_interactively(ds_meta: Any, current_task: str | None) -> str |
return raw
def _select_mode_interactively() -> str:
"""Ask which mode to start in: ``action`` (run the robot) or
``question`` (VQA only, robot paused).
Shown at startup, before the task picker. Non-TTY / scripted runs
default to ``action`` so existing pipelines are unaffected.
"""
if not sys.stdin.isatty():
return "action"
print("[smolvla2] Start in which mode?", flush=True)
print(" [1] action — run the robot autonomously (default)", flush=True)
print(" [2] question — ask the VLM questions (VQA); robot stays paused", flush=True)
try:
raw = input("mode> (Enter = action) ").strip().lower()
except (EOFError, KeyboardInterrupt):
return "action"
if raw in {"2", "question", "q", "/question", "/q", "vlm", "vqa", "/vlm", "/vqa"}:
return "question"
return "action"
def _print_runtime_help() -> None:
"""Print the slash-command reference."""
print(
"[smolvla2] commands:\n"
" /action run the robot (default mode)\n"
" /vlm pause the action loop; typed lines become VQA questions\n"
" /question pause the action loop; typed lines become VQA questions\n"
" /help show this help\n"
" task: <text> switch task (clears plan / memory / subtask)\n"
" rephrase: <text> reword the task in place\n"
@@ -980,24 +1001,25 @@ def _print_runtime_help() -> None:
def _handle_slash_command(runtime: Any, line: str) -> bool:
"""Handle ``/action`` / ``/vlm`` / ``/help``.
"""Handle ``/action`` / ``/question`` / ``/help``.
Returns ``True`` when ``line`` was a recognised command (and was
consumed), ``False`` otherwise.
``/vlm`` and ``/vqa`` are kept as aliases for ``/question``. Returns
``True`` when ``line`` was a recognised command (and was consumed),
``False`` otherwise.
"""
cmd = line.strip().lower()
if cmd in {"/action", "/act"}:
runtime.state["mode"] = "action"
print("[smolvla2] mode: action — robot running", flush=True)
return True
if cmd in {"/vlm", "/vqa"}:
runtime.state["mode"] = "vlm"
if cmd in {"/question", "/q", "/vlm", "/vqa"}:
runtime.state["mode"] = "question"
# Drop any queued chunk so no stale action fires while paused.
queue = runtime.state.get("action_queue")
if hasattr(queue, "clear"):
queue.clear()
print(
"[smolvla2] mode: vlm — action loop paused; type VQA questions",
"[smolvla2] mode: question — action loop paused; type VQA questions",
flush=True,
)
return True
@@ -1010,8 +1032,8 @@ def _handle_slash_command(runtime: Any, line: str) -> bool:
def _run_vqa_query(runtime: Any, question: str) -> None:
"""Run one interactive VQA question against the runtime's policy.
Used by both loops when in ``/vlm`` mode the action loop is paused
so the policy is free for a synchronous VQA call.
Used by both loops when in ``/question`` mode the action loop is
paused so the policy is free for a synchronous VQA call.
"""
from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query # noqa: PLC0415
@@ -1089,7 +1111,7 @@ def _run_autonomous(
redraw()
print(
" [autonomous] type interjections / '?' questions on stdin; "
"/vlm for VQA mode, /action to resume, /help for commands, "
"/question for VQA mode, /action to resume, /help for commands, "
"'stop' or Ctrl+C to quit",
flush=True,
)
@@ -1133,18 +1155,18 @@ def _run_autonomous(
lower = line.lower()
if lower in {"stop", "quit", "exit"}:
break
# Slash commands (/action, /vlm, /help) flip the run mode.
# Slash commands (/action, /question, /help) flip the run mode.
if _handle_slash_command(runtime, line):
# Redraw once so the panel reflects the new mode. In
# ``/vlm`` the timer redraw is now suspended, so this is
# the last clear — the VQA prompt below stays stable.
# ``/question`` the timer redraw is now suspended, so
# this is the last clear — the VQA prompt stays stable.
try:
redraw()
except Exception: # noqa: BLE001
pass
if runtime.state.get("mode") == "vlm":
if runtime.state.get("mode") == "question":
print(
" [vlm] type a VQA question and press Enter; "
" [question] type a VQA question and press Enter; "
"/action to resume the robot.",
flush=True,
)
@@ -1187,10 +1209,10 @@ def _run_autonomous(
if not runtime.state.get("task"):
runtime.set_task(line)
continue
# ``/vlm`` mode: the whole line is a VQA question, handled
# synchronously (the action loop is paused so the policy is
# not in concurrent use by the background runtime thread).
if runtime.state.get("mode", "action") == "vlm":
# ``/question`` mode: the whole line is a VQA question,
# handled synchronously (the action loop is paused so the
# policy is not in concurrent use by the background thread).
if runtime.state.get("mode", "action") == "question":
_run_vqa_query(runtime, line)
continue
if lower.endswith("?"):
@@ -1242,7 +1264,7 @@ def _make_state_panel_renderer(
mode_tag = (
"[green]mode: action[/]"
if run_mode == "action"
else "[yellow]mode: vlm (action loop paused)[/]"
else "[yellow]mode: question (action loop paused)[/]"
)
console.rule(
f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan"
@@ -1252,7 +1274,7 @@ def _make_state_panel_renderer(
# away under the timer redraw).
if run_mode == "action":
console.print(
" [dim]commands:[/] [bold]/vlm[/] ask a VQA question · "
" [dim]commands:[/] [bold]/question[/] ask a VQA question · "
"[bold]/help[/] all commands · [bold]stop[/] quit"
)
else:
@@ -1335,7 +1357,7 @@ def _make_state_panel_renderer(
console.print()
if not st.get("task"):
console.print(
" [dim]Type the task to begin. /vlm switches to VQA mode, "
" [dim]Type the task to begin. /question switches to VQA mode, "
"/action resumes the robot, /help lists commands. "
"Type 'stop' to exit.[/]"
)
@@ -1438,6 +1460,11 @@ def main(argv: list[str] | None = None) -> int:
flush=True,
)
# Startup mode prompt — choose action (run the robot) vs question
# (VQA only) *before* the task picker, so the operator sets intent
# up front. It can still be flipped any time with /action /question.
startup_mode = _select_mode_interactively()
# Always offer the startup task picker on an interactive terminal:
# list the dataset's tasks (the canonical / --task one shown as the
# default) so the operator can pick another or type a custom task.
@@ -1518,6 +1545,8 @@ def main(argv: list[str] | None = None) -> int:
runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0)
runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0)
runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0)
# Apply the startup mode chosen above the task picker.
runtime.state["mode"] = startup_mode
if args.task:
runtime.set_task(args.task)
# Seed the current subtask from the dataset so the first chunk —
@@ -1600,17 +1629,17 @@ def _run_repl(runtime: Any, *, initial_task: str | None, max_ticks: int | None)
if lower in {"stop", "quit", "exit"}:
break
# Slash commands (/action, /vlm, /help) flip the run mode.
# Slash commands (/action, /question, /help) flip the run mode.
if _handle_slash_command(runtime, line):
_redraw(last_logs)
continue
# ``/vlm`` mode: a typed line (that isn't a task command) is
# a VQA question — run it synchronously and skip the action
# pipeline tick entirely.
# ``/question`` mode: a typed line (that isn't a task
# command) is a VQA question — run it synchronously and skip
# the action pipeline tick entirely.
if (
runtime.state.get("task")
and runtime.state.get("mode", "action") == "vlm"
and runtime.state.get("mode", "action") == "question"
and not lower.startswith(("task:", "rephrase:"))
):
runtime.state["log_lines"] = []