mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-19 10:40:04 +00:00
feat(smolvla2): startup mode prompt; rename /vlm mode to /question
Add a mode prompt at startup, shown before the task picker, so the operator chooses action (run the robot) vs question (VQA only) up front instead of having to discover /vlm mid-run. Also rename the VQA mode from "vlm" to the clearer "question": state["mode"] value is now "action" | "question"; the command is /question (/vlm and /vqa kept as aliases); panels, hints and help text are updated to match. handle_vqa_query now reports via both push_log and direct stdout, so VQA answers / overlay paths are visible in autonomous question mode where the panel redraw is suspended. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,7 +16,7 @@
|
||||
Reads non-blocking stdin lines, classifies each one heuristically:
|
||||
|
||||
"stop" / "quit" / "exit" → state["stop"] = True
|
||||
"/action" / "/vlm" → set state["mode"]
|
||||
"/action" / "/question" → set state["mode"]
|
||||
ends with "?" → user_vqa_query event
|
||||
starts with "task:" or first line → set runtime task
|
||||
anything else → user_interjection event
|
||||
@@ -75,14 +75,14 @@ class StdinReader:
|
||||
state["stop"] = True
|
||||
return
|
||||
|
||||
# Slash commands flip the run mode. ``/vlm`` pauses the action
|
||||
# loop (the action steps gate on ``state["mode"]``); ``/action``
|
||||
# resumes it.
|
||||
# Slash commands flip the run mode. ``/question`` pauses the
|
||||
# action loop (the action steps gate on ``state["mode"]``);
|
||||
# ``/action`` resumes it. ``/vlm`` / ``/vqa`` are kept as aliases.
|
||||
if lower in {"/action", "/act"}:
|
||||
state["mode"] = "action"
|
||||
return
|
||||
if lower in {"/vlm", "/vqa"}:
|
||||
state["mode"] = "vlm"
|
||||
if lower in {"/question", "/q", "/vlm", "/vqa"}:
|
||||
state["mode"] = "question"
|
||||
queue = state.get("action_queue")
|
||||
if hasattr(queue, "clear"):
|
||||
queue.clear()
|
||||
|
||||
@@ -33,8 +33,8 @@ Stable keys (read by multiple steps):
|
||||
events_this_tick list[str] triggers consumed this tick
|
||||
_tick Tick current tick (set by the loop)
|
||||
|
||||
mode str "action" (run the robot) | "vlm" (VQA only,
|
||||
action loop paused)
|
||||
mode str "action" (run the robot) | "question" (VQA
|
||||
only, action loop paused)
|
||||
|
||||
log_lines list[str] human-readable status lines printed each tick
|
||||
"""
|
||||
|
||||
@@ -93,7 +93,7 @@ def make_state_panel(state: dict[str, Any]) -> Any:
|
||||
table.add_row("", footer)
|
||||
run_mode = state.get("mode", "action")
|
||||
mode_tag = (
|
||||
"[green]action[/]" if run_mode == "action" else "[yellow]vlm (paused)[/]"
|
||||
"[green]action[/]" if run_mode == "action" else "[yellow]question (paused)[/]"
|
||||
)
|
||||
return Panel(
|
||||
table,
|
||||
|
||||
@@ -272,14 +272,24 @@ def handle_vqa_query(
|
||||
"""Run one interactive VQA question end to end.
|
||||
|
||||
Called synchronously from the input layer while the runtime is in
|
||||
``/vlm`` mode (the action loop is gated off, so the policy is not in
|
||||
concurrent use). All progress is reported via :func:`push_log` so it
|
||||
shows up in the state panel's scrollback.
|
||||
``/question`` mode (the action loop is gated off, so the policy is
|
||||
not in concurrent use). Progress is reported via both
|
||||
:func:`push_log` (REPL panel scrollback) and ``print_fn`` (direct
|
||||
stdout) — in autonomous question mode the panel redraw is suspended,
|
||||
so the direct print is what the operator actually sees.
|
||||
"""
|
||||
from .steps import _generate_with_policy, _msgs_for_vqa # noqa: PLC0415
|
||||
|
||||
def report(line: str) -> None:
|
||||
"""Surface a line both to the panel scrollback and to stdout."""
|
||||
push_log(state, line)
|
||||
try:
|
||||
print_fn(line)
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
|
||||
if policy is None or not hasattr(policy, "select_message"):
|
||||
push_log(state, " [warn] vqa: policy has no select_message — skipping")
|
||||
report(" [warn] vqa: policy has no select_message — skipping")
|
||||
return
|
||||
|
||||
observation: dict | None = None
|
||||
@@ -294,11 +304,11 @@ def handle_vqa_query(
|
||||
if cameras:
|
||||
chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
|
||||
if chosen is None:
|
||||
push_log(state, " [info] vqa cancelled — no camera selected")
|
||||
report(" [info] vqa cancelled — no camera selected")
|
||||
return
|
||||
push_log(state, f" vqa camera: {camera_short_name(chosen)}")
|
||||
report(f" vqa camera: {camera_short_name(chosen)}")
|
||||
else:
|
||||
push_log(state, " [info] vqa: no camera available — answering text-only")
|
||||
report(" [info] vqa: no camera available — answering text-only")
|
||||
|
||||
# Ground the question on the chosen camera only — filter the
|
||||
# observation to that one image (+ proprio state) so the VLM
|
||||
@@ -317,23 +327,23 @@ def handle_vqa_query(
|
||||
label="vqa gen",
|
||||
)
|
||||
if not answer:
|
||||
push_log(state, " [info] vqa gen returned empty")
|
||||
report(" [info] vqa gen returned empty")
|
||||
return
|
||||
push_log(state, f" vqa: {answer}")
|
||||
report(f" vqa: {answer}")
|
||||
|
||||
parsed = parse_vqa_answer(answer)
|
||||
if not answer_has_overlay(parsed):
|
||||
if parsed is None:
|
||||
push_log(state, " [info] vqa answer is not JSON — no overlay")
|
||||
report(" [info] vqa answer is not JSON — no overlay")
|
||||
return
|
||||
if observation is None or chosen is None:
|
||||
push_log(state, " [info] no camera image — cannot draw overlay")
|
||||
report(" [info] no camera image — cannot draw overlay")
|
||||
return
|
||||
try:
|
||||
pil = observation_image_to_pil(observation[chosen])
|
||||
overlay = draw_vqa_overlay(pil, parsed)
|
||||
path = save_and_open_overlay(overlay)
|
||||
push_log(state, f" vqa overlay saved: {path}")
|
||||
report(f" vqa overlay saved: {path}")
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
|
||||
push_log(state, f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
|
||||
report(f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
|
||||
|
||||
@@ -965,12 +965,33 @@ def _select_task_interactively(ds_meta: Any, current_task: str | None) -> str |
|
||||
return raw
|
||||
|
||||
|
||||
def _select_mode_interactively() -> str:
    """Ask which mode to start in and return ``"action"`` or ``"question"``.

    ``action`` runs the robot; ``question`` is VQA only, with the robot
    paused. Shown at startup, before the task picker.

    Returns:
        ``"question"`` when the operator answers ``2`` or one of the
        question-mode names / aliases (case-insensitive); ``"action"``
        otherwise — including empty input, unrecognised input, EOF or
        Ctrl+C at the prompt.

    Non-TTY / scripted runs return ``"action"`` without prompting so
    existing pipelines are unaffected. A missing or closed stdin is
    treated the same way instead of raising.
    """
    # ``sys.stdin`` may be ``None`` and ``isatty()`` raises ``ValueError``
    # on a closed stream; neither is an interactive terminal, so fall
    # back to the scripted default rather than crash at startup.
    try:
        interactive = sys.stdin is not None and sys.stdin.isatty()
    except (AttributeError, ValueError):
        interactive = False
    if not interactive:
        return "action"

    print("[smolvla2] Start in which mode?", flush=True)
    print(" [1] action — run the robot autonomously (default)", flush=True)
    print(" [2] question — ask the VLM questions (VQA); robot stays paused", flush=True)
    try:
        raw = input("mode> (Enter = action) ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        return "action"

    # Accept the menu number, the mode name, and the slash-command aliases.
    question_answers = {"2", "question", "q", "/question", "/q", "vlm", "vqa", "/vlm", "/vqa"}
    return "question" if raw in question_answers else "action"
|
||||
|
||||
|
||||
def _print_runtime_help() -> None:
|
||||
"""Print the slash-command reference."""
|
||||
print(
|
||||
"[smolvla2] commands:\n"
|
||||
" /action run the robot (default mode)\n"
|
||||
" /vlm pause the action loop; typed lines become VQA questions\n"
|
||||
" /question pause the action loop; typed lines become VQA questions\n"
|
||||
" /help show this help\n"
|
||||
" task: <text> switch task (clears plan / memory / subtask)\n"
|
||||
" rephrase: <text> reword the task in place\n"
|
||||
@@ -980,24 +1001,25 @@ def _print_runtime_help() -> None:
|
||||
|
||||
|
||||
def _handle_slash_command(runtime: Any, line: str) -> bool:
|
||||
"""Handle ``/action`` / ``/vlm`` / ``/help``.
|
||||
"""Handle ``/action`` / ``/question`` / ``/help``.
|
||||
|
||||
Returns ``True`` when ``line`` was a recognised command (and was
|
||||
consumed), ``False`` otherwise.
|
||||
``/vlm`` and ``/vqa`` are kept as aliases for ``/question``. Returns
|
||||
``True`` when ``line`` was a recognised command (and was consumed),
|
||||
``False`` otherwise.
|
||||
"""
|
||||
cmd = line.strip().lower()
|
||||
if cmd in {"/action", "/act"}:
|
||||
runtime.state["mode"] = "action"
|
||||
print("[smolvla2] mode: action — robot running", flush=True)
|
||||
return True
|
||||
if cmd in {"/vlm", "/vqa"}:
|
||||
runtime.state["mode"] = "vlm"
|
||||
if cmd in {"/question", "/q", "/vlm", "/vqa"}:
|
||||
runtime.state["mode"] = "question"
|
||||
# Drop any queued chunk so no stale action fires while paused.
|
||||
queue = runtime.state.get("action_queue")
|
||||
if hasattr(queue, "clear"):
|
||||
queue.clear()
|
||||
print(
|
||||
"[smolvla2] mode: vlm — action loop paused; type VQA questions",
|
||||
"[smolvla2] mode: question — action loop paused; type VQA questions",
|
||||
flush=True,
|
||||
)
|
||||
return True
|
||||
@@ -1010,8 +1032,8 @@ def _handle_slash_command(runtime: Any, line: str) -> bool:
|
||||
def _run_vqa_query(runtime: Any, question: str) -> None:
|
||||
"""Run one interactive VQA question against the runtime's policy.
|
||||
|
||||
Used by both loops when in ``/vlm`` mode — the action loop is paused
|
||||
so the policy is free for a synchronous VQA call.
|
||||
Used by both loops when in ``/question`` mode — the action loop is
|
||||
paused so the policy is free for a synchronous VQA call.
|
||||
"""
|
||||
from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query # noqa: PLC0415
|
||||
|
||||
@@ -1089,7 +1111,7 @@ def _run_autonomous(
|
||||
redraw()
|
||||
print(
|
||||
" [autonomous] type interjections / '?' questions on stdin; "
|
||||
"/vlm for VQA mode, /action to resume, /help for commands, "
|
||||
"/question for VQA mode, /action to resume, /help for commands, "
|
||||
"'stop' or Ctrl+C to quit",
|
||||
flush=True,
|
||||
)
|
||||
@@ -1133,18 +1155,18 @@ def _run_autonomous(
|
||||
lower = line.lower()
|
||||
if lower in {"stop", "quit", "exit"}:
|
||||
break
|
||||
# Slash commands (/action, /vlm, /help) flip the run mode.
|
||||
# Slash commands (/action, /question, /help) flip the run mode.
|
||||
if _handle_slash_command(runtime, line):
|
||||
# Redraw once so the panel reflects the new mode. In
|
||||
# ``/vlm`` the timer redraw is now suspended, so this is
|
||||
# the last clear — the VQA prompt below stays stable.
|
||||
# ``/question`` the timer redraw is now suspended, so
|
||||
# this is the last clear — the VQA prompt stays stable.
|
||||
try:
|
||||
redraw()
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
if runtime.state.get("mode") == "vlm":
|
||||
if runtime.state.get("mode") == "question":
|
||||
print(
|
||||
" [vlm] type a VQA question and press Enter; "
|
||||
" [question] type a VQA question and press Enter; "
|
||||
"/action to resume the robot.",
|
||||
flush=True,
|
||||
)
|
||||
@@ -1187,10 +1209,10 @@ def _run_autonomous(
|
||||
if not runtime.state.get("task"):
|
||||
runtime.set_task(line)
|
||||
continue
|
||||
# ``/vlm`` mode: the whole line is a VQA question, handled
|
||||
# synchronously (the action loop is paused so the policy is
|
||||
# not in concurrent use by the background runtime thread).
|
||||
if runtime.state.get("mode", "action") == "vlm":
|
||||
# ``/question`` mode: the whole line is a VQA question,
|
||||
# handled synchronously (the action loop is paused so the
|
||||
# policy is not in concurrent use by the background thread).
|
||||
if runtime.state.get("mode", "action") == "question":
|
||||
_run_vqa_query(runtime, line)
|
||||
continue
|
||||
if lower.endswith("?"):
|
||||
@@ -1242,7 +1264,7 @@ def _make_state_panel_renderer(
|
||||
mode_tag = (
|
||||
"[green]mode: action[/]"
|
||||
if run_mode == "action"
|
||||
else "[yellow]mode: vlm (action loop paused)[/]"
|
||||
else "[yellow]mode: question (action loop paused)[/]"
|
||||
)
|
||||
console.rule(
|
||||
f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan"
|
||||
@@ -1252,7 +1274,7 @@ def _make_state_panel_renderer(
|
||||
# away under the timer redraw).
|
||||
if run_mode == "action":
|
||||
console.print(
|
||||
" [dim]commands:[/] [bold]/vlm[/] ask a VQA question · "
|
||||
" [dim]commands:[/] [bold]/question[/] ask a VQA question · "
|
||||
"[bold]/help[/] all commands · [bold]stop[/] quit"
|
||||
)
|
||||
else:
|
||||
@@ -1335,7 +1357,7 @@ def _make_state_panel_renderer(
|
||||
console.print()
|
||||
if not st.get("task"):
|
||||
console.print(
|
||||
" [dim]Type the task to begin. /vlm switches to VQA mode, "
|
||||
" [dim]Type the task to begin. /question switches to VQA mode, "
|
||||
"/action resumes the robot, /help lists commands. "
|
||||
"Type 'stop' to exit.[/]"
|
||||
)
|
||||
@@ -1438,6 +1460,11 @@ def main(argv: list[str] | None = None) -> int:
|
||||
flush=True,
|
||||
)
|
||||
|
||||
# Startup mode prompt — choose action (run the robot) vs question
|
||||
# (VQA only) *before* the task picker, so the operator sets intent
|
||||
# up front. It can still be flipped any time with /action /question.
|
||||
startup_mode = _select_mode_interactively()
|
||||
|
||||
# Always offer the startup task picker on an interactive terminal:
|
||||
# list the dataset's tasks (the canonical / --task one shown as the
|
||||
# default) so the operator can pick another or type a custom task.
|
||||
@@ -1518,6 +1545,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0)
|
||||
runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0)
|
||||
runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0)
|
||||
# Apply the startup mode chosen above the task picker.
|
||||
runtime.state["mode"] = startup_mode
|
||||
if args.task:
|
||||
runtime.set_task(args.task)
|
||||
# Seed the current subtask from the dataset so the first chunk —
|
||||
@@ -1600,17 +1629,17 @@ def _run_repl(runtime: Any, *, initial_task: str | None, max_ticks: int | None)
|
||||
if lower in {"stop", "quit", "exit"}:
|
||||
break
|
||||
|
||||
# Slash commands (/action, /vlm, /help) flip the run mode.
|
||||
# Slash commands (/action, /question, /help) flip the run mode.
|
||||
if _handle_slash_command(runtime, line):
|
||||
_redraw(last_logs)
|
||||
continue
|
||||
|
||||
# ``/vlm`` mode: a typed line (that isn't a task command) is
|
||||
# a VQA question — run it synchronously and skip the action
|
||||
# pipeline tick entirely.
|
||||
# ``/question`` mode: a typed line (that isn't a task
|
||||
# command) is a VQA question — run it synchronously and skip
|
||||
# the action pipeline tick entirely.
|
||||
if (
|
||||
runtime.state.get("task")
|
||||
and runtime.state.get("mode", "action") == "vlm"
|
||||
and runtime.state.get("mode", "action") == "question"
|
||||
and not lower.startswith(("task:", "rephrase:"))
|
||||
):
|
||||
runtime.state["log_lines"] = []
|
||||
|
||||
Reference in New Issue
Block a user