mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 12:09:42 +00:00
feat(smolvla2): startup mode prompt; rename /vlm mode to /question
Add a mode prompt at startup, shown before the task picker, so the operator chooses action (run the robot) vs question (VQA only) up front instead of having to discover /vlm mid-run. Also rename the VQA mode from "vlm" to the clearer "question": state["mode"] value is now "action" | "question"; the command is /question (/vlm and /vqa kept as aliases); panels, hints and help text are updated to match. handle_vqa_query now reports via both push_log and direct stdout, so VQA answers / overlay paths are visible in autonomous question mode, where the panel redraw is suspended. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,7 +16,7 @@
|
|||||||
Reads non-blocking stdin lines, classifies each one heuristically:
|
Reads non-blocking stdin lines, classifies each one heuristically:
|
||||||
|
|
||||||
"stop" / "quit" / "exit" → state["stop"] = True
|
"stop" / "quit" / "exit" → state["stop"] = True
|
||||||
"/action" / "/vlm" → set state["mode"]
|
"/action" / "/question" → set state["mode"]
|
||||||
ends with "?" → user_vqa_query event
|
ends with "?" → user_vqa_query event
|
||||||
starts with "task:" or first line → set runtime task
|
starts with "task:" or first line → set runtime task
|
||||||
anything else → user_interjection event
|
anything else → user_interjection event
|
||||||
@@ -75,14 +75,14 @@ class StdinReader:
|
|||||||
state["stop"] = True
|
state["stop"] = True
|
||||||
return
|
return
|
||||||
|
|
||||||
# Slash commands flip the run mode. ``/vlm`` pauses the action
|
# Slash commands flip the run mode. ``/question`` pauses the
|
||||||
# loop (the action steps gate on ``state["mode"]``); ``/action``
|
# action loop (the action steps gate on ``state["mode"]``);
|
||||||
# resumes it.
|
# ``/action`` resumes it. ``/vlm`` / ``/vqa`` are kept as aliases.
|
||||||
if lower in {"/action", "/act"}:
|
if lower in {"/action", "/act"}:
|
||||||
state["mode"] = "action"
|
state["mode"] = "action"
|
||||||
return
|
return
|
||||||
if lower in {"/vlm", "/vqa"}:
|
if lower in {"/question", "/q", "/vlm", "/vqa"}:
|
||||||
state["mode"] = "vlm"
|
state["mode"] = "question"
|
||||||
queue = state.get("action_queue")
|
queue = state.get("action_queue")
|
||||||
if hasattr(queue, "clear"):
|
if hasattr(queue, "clear"):
|
||||||
queue.clear()
|
queue.clear()
|
||||||
|
|||||||
@@ -33,8 +33,8 @@ Stable keys (read by multiple steps):
|
|||||||
events_this_tick list[str] triggers consumed this tick
|
events_this_tick list[str] triggers consumed this tick
|
||||||
_tick Tick current tick (set by the loop)
|
_tick Tick current tick (set by the loop)
|
||||||
|
|
||||||
mode str "action" (run the robot) | "vlm" (VQA only,
|
mode str "action" (run the robot) | "question" (VQA
|
||||||
action loop paused)
|
only, action loop paused)
|
||||||
|
|
||||||
log_lines list[str] human-readable status lines printed each tick
|
log_lines list[str] human-readable status lines printed each tick
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ def make_state_panel(state: dict[str, Any]) -> Any:
|
|||||||
table.add_row("", footer)
|
table.add_row("", footer)
|
||||||
run_mode = state.get("mode", "action")
|
run_mode = state.get("mode", "action")
|
||||||
mode_tag = (
|
mode_tag = (
|
||||||
"[green]action[/]" if run_mode == "action" else "[yellow]vlm (paused)[/]"
|
"[green]action[/]" if run_mode == "action" else "[yellow]question (paused)[/]"
|
||||||
)
|
)
|
||||||
return Panel(
|
return Panel(
|
||||||
table,
|
table,
|
||||||
|
|||||||
@@ -272,14 +272,24 @@ def handle_vqa_query(
|
|||||||
"""Run one interactive VQA question end to end.
|
"""Run one interactive VQA question end to end.
|
||||||
|
|
||||||
Called synchronously from the input layer while the runtime is in
|
Called synchronously from the input layer while the runtime is in
|
||||||
``/vlm`` mode (the action loop is gated off, so the policy is not in
|
``/question`` mode (the action loop is gated off, so the policy is
|
||||||
concurrent use). All progress is reported via :func:`push_log` so it
|
not in concurrent use). Progress is reported via both
|
||||||
shows up in the state panel's scrollback.
|
:func:`push_log` (REPL panel scrollback) and ``print_fn`` (direct
|
||||||
|
stdout) — in autonomous question mode the panel redraw is suspended,
|
||||||
|
so the direct print is what the operator actually sees.
|
||||||
"""
|
"""
|
||||||
from .steps import _generate_with_policy, _msgs_for_vqa # noqa: PLC0415
|
from .steps import _generate_with_policy, _msgs_for_vqa # noqa: PLC0415
|
||||||
|
|
||||||
|
def report(line: str) -> None:
|
||||||
|
"""Surface a line both to the panel scrollback and to stdout."""
|
||||||
|
push_log(state, line)
|
||||||
|
try:
|
||||||
|
print_fn(line)
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
|
||||||
if policy is None or not hasattr(policy, "select_message"):
|
if policy is None or not hasattr(policy, "select_message"):
|
||||||
push_log(state, " [warn] vqa: policy has no select_message — skipping")
|
report(" [warn] vqa: policy has no select_message — skipping")
|
||||||
return
|
return
|
||||||
|
|
||||||
observation: dict | None = None
|
observation: dict | None = None
|
||||||
@@ -294,11 +304,11 @@ def handle_vqa_query(
|
|||||||
if cameras:
|
if cameras:
|
||||||
chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
|
chosen = prompt_camera_choice(cameras, input_fn=input_fn, print_fn=print_fn)
|
||||||
if chosen is None:
|
if chosen is None:
|
||||||
push_log(state, " [info] vqa cancelled — no camera selected")
|
report(" [info] vqa cancelled — no camera selected")
|
||||||
return
|
return
|
||||||
push_log(state, f" vqa camera: {camera_short_name(chosen)}")
|
report(f" vqa camera: {camera_short_name(chosen)}")
|
||||||
else:
|
else:
|
||||||
push_log(state, " [info] vqa: no camera available — answering text-only")
|
report(" [info] vqa: no camera available — answering text-only")
|
||||||
|
|
||||||
# Ground the question on the chosen camera only — filter the
|
# Ground the question on the chosen camera only — filter the
|
||||||
# observation to that one image (+ proprio state) so the VLM
|
# observation to that one image (+ proprio state) so the VLM
|
||||||
@@ -317,23 +327,23 @@ def handle_vqa_query(
|
|||||||
label="vqa gen",
|
label="vqa gen",
|
||||||
)
|
)
|
||||||
if not answer:
|
if not answer:
|
||||||
push_log(state, " [info] vqa gen returned empty")
|
report(" [info] vqa gen returned empty")
|
||||||
return
|
return
|
||||||
push_log(state, f" vqa: {answer}")
|
report(f" vqa: {answer}")
|
||||||
|
|
||||||
parsed = parse_vqa_answer(answer)
|
parsed = parse_vqa_answer(answer)
|
||||||
if not answer_has_overlay(parsed):
|
if not answer_has_overlay(parsed):
|
||||||
if parsed is None:
|
if parsed is None:
|
||||||
push_log(state, " [info] vqa answer is not JSON — no overlay")
|
report(" [info] vqa answer is not JSON — no overlay")
|
||||||
return
|
return
|
||||||
if observation is None or chosen is None:
|
if observation is None or chosen is None:
|
||||||
push_log(state, " [info] no camera image — cannot draw overlay")
|
report(" [info] no camera image — cannot draw overlay")
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
pil = observation_image_to_pil(observation[chosen])
|
pil = observation_image_to_pil(observation[chosen])
|
||||||
overlay = draw_vqa_overlay(pil, parsed)
|
overlay = draw_vqa_overlay(pil, parsed)
|
||||||
path = save_and_open_overlay(overlay)
|
path = save_and_open_overlay(overlay)
|
||||||
push_log(state, f" vqa overlay saved: {path}")
|
report(f" vqa overlay saved: {path}")
|
||||||
except Exception as exc: # noqa: BLE001
|
except Exception as exc: # noqa: BLE001
|
||||||
logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
|
logger.warning("vqa overlay failed: %s", exc, exc_info=logger.isEnabledFor(logging.DEBUG))
|
||||||
push_log(state, f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
|
report(f" [warn] vqa overlay failed: {type(exc).__name__}: {exc}")
|
||||||
|
|||||||
@@ -965,12 +965,33 @@ def _select_task_interactively(ds_meta: Any, current_task: str | None) -> str |
|
|||||||
return raw
|
return raw
|
||||||
|
|
||||||
|
|
||||||
|
def _select_mode_interactively() -> str:
    """Ask which mode to start in: ``action`` or ``question``.

    ``action`` runs the robot; ``question`` is VQA only, with the robot
    paused. Non-TTY / scripted runs are never prompted and default to
    ``action`` so existing pipelines are unaffected.

    On an interactive terminal the prompt repeats until the answer is
    recognised. An unrecognised answer (e.g. a typo of "question") is
    *not* treated as ``action``: silently starting the robot on a
    mistyped reply is the wrong failure mode for a mode selector.

    Returns:
        ``"action"`` or ``"question"``. An empty line (Enter), EOF, or
        Ctrl+C at the prompt all yield ``"action"``.
    """
    if not sys.stdin.isatty():
        return "action"

    # Accepted spellings, compared after strip() + lower(). The empty
    # string is the bare-Enter default advertised in the prompt.
    action_answers = {"", "1", "action", "a", "act", "/action", "/act"}
    question_answers = {"2", "question", "q", "/question", "/q", "vlm", "vqa", "/vlm", "/vqa"}

    print("[smolvla2] Start in which mode?", flush=True)
    print("  [1] action   — run the robot autonomously (default)", flush=True)
    print("  [2] question — ask the VLM questions (VQA); robot stays paused", flush=True)

    while True:
        try:
            raw = input("mode> (Enter = action) ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt falls back to
            # the default mode, same as a non-interactive run.
            return "action"
        if raw in question_answers:
            return "question"
        if raw in action_answers:
            return "action"
        print(
            f"[smolvla2] unrecognised mode {raw!r} — enter 1 (action) or 2 (question)",
            flush=True,
        )
|
||||||
|
|
||||||
|
|
||||||
def _print_runtime_help() -> None:
|
def _print_runtime_help() -> None:
|
||||||
"""Print the slash-command reference."""
|
"""Print the slash-command reference."""
|
||||||
print(
|
print(
|
||||||
"[smolvla2] commands:\n"
|
"[smolvla2] commands:\n"
|
||||||
" /action run the robot (default mode)\n"
|
" /action run the robot (default mode)\n"
|
||||||
" /vlm pause the action loop; typed lines become VQA questions\n"
|
" /question pause the action loop; typed lines become VQA questions\n"
|
||||||
" /help show this help\n"
|
" /help show this help\n"
|
||||||
" task: <text> switch task (clears plan / memory / subtask)\n"
|
" task: <text> switch task (clears plan / memory / subtask)\n"
|
||||||
" rephrase: <text> reword the task in place\n"
|
" rephrase: <text> reword the task in place\n"
|
||||||
@@ -980,24 +1001,25 @@ def _print_runtime_help() -> None:
|
|||||||
|
|
||||||
|
|
||||||
def _handle_slash_command(runtime: Any, line: str) -> bool:
|
def _handle_slash_command(runtime: Any, line: str) -> bool:
|
||||||
"""Handle ``/action`` / ``/vlm`` / ``/help``.
|
"""Handle ``/action`` / ``/question`` / ``/help``.
|
||||||
|
|
||||||
Returns ``True`` when ``line`` was a recognised command (and was
|
``/vlm`` and ``/vqa`` are kept as aliases for ``/question``. Returns
|
||||||
consumed), ``False`` otherwise.
|
``True`` when ``line`` was a recognised command (and was consumed),
|
||||||
|
``False`` otherwise.
|
||||||
"""
|
"""
|
||||||
cmd = line.strip().lower()
|
cmd = line.strip().lower()
|
||||||
if cmd in {"/action", "/act"}:
|
if cmd in {"/action", "/act"}:
|
||||||
runtime.state["mode"] = "action"
|
runtime.state["mode"] = "action"
|
||||||
print("[smolvla2] mode: action — robot running", flush=True)
|
print("[smolvla2] mode: action — robot running", flush=True)
|
||||||
return True
|
return True
|
||||||
if cmd in {"/vlm", "/vqa"}:
|
if cmd in {"/question", "/q", "/vlm", "/vqa"}:
|
||||||
runtime.state["mode"] = "vlm"
|
runtime.state["mode"] = "question"
|
||||||
# Drop any queued chunk so no stale action fires while paused.
|
# Drop any queued chunk so no stale action fires while paused.
|
||||||
queue = runtime.state.get("action_queue")
|
queue = runtime.state.get("action_queue")
|
||||||
if hasattr(queue, "clear"):
|
if hasattr(queue, "clear"):
|
||||||
queue.clear()
|
queue.clear()
|
||||||
print(
|
print(
|
||||||
"[smolvla2] mode: vlm — action loop paused; type VQA questions",
|
"[smolvla2] mode: question — action loop paused; type VQA questions",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
@@ -1010,8 +1032,8 @@ def _handle_slash_command(runtime: Any, line: str) -> bool:
|
|||||||
def _run_vqa_query(runtime: Any, question: str) -> None:
|
def _run_vqa_query(runtime: Any, question: str) -> None:
|
||||||
"""Run one interactive VQA question against the runtime's policy.
|
"""Run one interactive VQA question against the runtime's policy.
|
||||||
|
|
||||||
Used by both loops when in ``/vlm`` mode — the action loop is paused
|
Used by both loops when in ``/question`` mode — the action loop is
|
||||||
so the policy is free for a synchronous VQA call.
|
paused so the policy is free for a synchronous VQA call.
|
||||||
"""
|
"""
|
||||||
from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query # noqa: PLC0415
|
from lerobot.policies.smolvla2.inference.vqa import handle_vqa_query # noqa: PLC0415
|
||||||
|
|
||||||
@@ -1089,7 +1111,7 @@ def _run_autonomous(
|
|||||||
redraw()
|
redraw()
|
||||||
print(
|
print(
|
||||||
" [autonomous] type interjections / '?' questions on stdin; "
|
" [autonomous] type interjections / '?' questions on stdin; "
|
||||||
"/vlm for VQA mode, /action to resume, /help for commands, "
|
"/question for VQA mode, /action to resume, /help for commands, "
|
||||||
"'stop' or Ctrl+C to quit",
|
"'stop' or Ctrl+C to quit",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
@@ -1133,18 +1155,18 @@ def _run_autonomous(
|
|||||||
lower = line.lower()
|
lower = line.lower()
|
||||||
if lower in {"stop", "quit", "exit"}:
|
if lower in {"stop", "quit", "exit"}:
|
||||||
break
|
break
|
||||||
# Slash commands (/action, /vlm, /help) flip the run mode.
|
# Slash commands (/action, /question, /help) flip the run mode.
|
||||||
if _handle_slash_command(runtime, line):
|
if _handle_slash_command(runtime, line):
|
||||||
# Redraw once so the panel reflects the new mode. In
|
# Redraw once so the panel reflects the new mode. In
|
||||||
# ``/vlm`` the timer redraw is now suspended, so this is
|
# ``/question`` the timer redraw is now suspended, so
|
||||||
# the last clear — the VQA prompt below stays stable.
|
# this is the last clear — the VQA prompt stays stable.
|
||||||
try:
|
try:
|
||||||
redraw()
|
redraw()
|
||||||
except Exception: # noqa: BLE001
|
except Exception: # noqa: BLE001
|
||||||
pass
|
pass
|
||||||
if runtime.state.get("mode") == "vlm":
|
if runtime.state.get("mode") == "question":
|
||||||
print(
|
print(
|
||||||
" [vlm] type a VQA question and press Enter; "
|
" [question] type a VQA question and press Enter; "
|
||||||
"/action to resume the robot.",
|
"/action to resume the robot.",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
@@ -1187,10 +1209,10 @@ def _run_autonomous(
|
|||||||
if not runtime.state.get("task"):
|
if not runtime.state.get("task"):
|
||||||
runtime.set_task(line)
|
runtime.set_task(line)
|
||||||
continue
|
continue
|
||||||
# ``/vlm`` mode: the whole line is a VQA question, handled
|
# ``/question`` mode: the whole line is a VQA question,
|
||||||
# synchronously (the action loop is paused so the policy is
|
# handled synchronously (the action loop is paused so the
|
||||||
# not in concurrent use by the background runtime thread).
|
# policy is not in concurrent use by the background thread).
|
||||||
if runtime.state.get("mode", "action") == "vlm":
|
if runtime.state.get("mode", "action") == "question":
|
||||||
_run_vqa_query(runtime, line)
|
_run_vqa_query(runtime, line)
|
||||||
continue
|
continue
|
||||||
if lower.endswith("?"):
|
if lower.endswith("?"):
|
||||||
@@ -1242,7 +1264,7 @@ def _make_state_panel_renderer(
|
|||||||
mode_tag = (
|
mode_tag = (
|
||||||
"[green]mode: action[/]"
|
"[green]mode: action[/]"
|
||||||
if run_mode == "action"
|
if run_mode == "action"
|
||||||
else "[yellow]mode: vlm (action loop paused)[/]"
|
else "[yellow]mode: question (action loop paused)[/]"
|
||||||
)
|
)
|
||||||
console.rule(
|
console.rule(
|
||||||
f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan"
|
f"[bold]SmolVLA2[/] · {mode_label} · {mode_tag}", style="cyan"
|
||||||
@@ -1252,7 +1274,7 @@ def _make_state_panel_renderer(
|
|||||||
# away under the timer redraw).
|
# away under the timer redraw).
|
||||||
if run_mode == "action":
|
if run_mode == "action":
|
||||||
console.print(
|
console.print(
|
||||||
" [dim]commands:[/] [bold]/vlm[/] ask a VQA question · "
|
" [dim]commands:[/] [bold]/question[/] ask a VQA question · "
|
||||||
"[bold]/help[/] all commands · [bold]stop[/] quit"
|
"[bold]/help[/] all commands · [bold]stop[/] quit"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -1335,7 +1357,7 @@ def _make_state_panel_renderer(
|
|||||||
console.print()
|
console.print()
|
||||||
if not st.get("task"):
|
if not st.get("task"):
|
||||||
console.print(
|
console.print(
|
||||||
" [dim]Type the task to begin. /vlm switches to VQA mode, "
|
" [dim]Type the task to begin. /question switches to VQA mode, "
|
||||||
"/action resumes the robot, /help lists commands. "
|
"/action resumes the robot, /help lists commands. "
|
||||||
"Type 'stop' to exit.[/]"
|
"Type 'stop' to exit.[/]"
|
||||||
)
|
)
|
||||||
@@ -1438,6 +1460,11 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Startup mode prompt — choose action (run the robot) vs question
|
||||||
|
# (VQA only) *before* the task picker, so the operator sets intent
|
||||||
|
# up front. It can still be flipped any time with /action /question.
|
||||||
|
startup_mode = _select_mode_interactively()
|
||||||
|
|
||||||
# Always offer the startup task picker on an interactive terminal:
|
# Always offer the startup task picker on an interactive terminal:
|
||||||
# list the dataset's tasks (the canonical / --task one shown as the
|
# list the dataset's tasks (the canonical / --task one shown as the
|
||||||
# default) so the operator can pick another or type a custom task.
|
# default) so the operator can pick another or type a custom task.
|
||||||
@@ -1518,6 +1545,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0)
|
runtime.state["text_gen_min_new_tokens"] = int(getattr(args, "text_min_new_tokens", 0) or 0)
|
||||||
runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0)
|
runtime.state["text_gen_temperature"] = float(getattr(args, "text_temperature", 0.0) or 0.0)
|
||||||
runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0)
|
runtime.state["text_gen_top_p"] = float(getattr(args, "text_top_p", 1.0) or 1.0)
|
||||||
|
# Apply the startup mode chosen above the task picker.
|
||||||
|
runtime.state["mode"] = startup_mode
|
||||||
if args.task:
|
if args.task:
|
||||||
runtime.set_task(args.task)
|
runtime.set_task(args.task)
|
||||||
# Seed the current subtask from the dataset so the first chunk —
|
# Seed the current subtask from the dataset so the first chunk —
|
||||||
@@ -1600,17 +1629,17 @@ def _run_repl(runtime: Any, *, initial_task: str | None, max_ticks: int | None)
|
|||||||
if lower in {"stop", "quit", "exit"}:
|
if lower in {"stop", "quit", "exit"}:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Slash commands (/action, /vlm, /help) flip the run mode.
|
# Slash commands (/action, /question, /help) flip the run mode.
|
||||||
if _handle_slash_command(runtime, line):
|
if _handle_slash_command(runtime, line):
|
||||||
_redraw(last_logs)
|
_redraw(last_logs)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# ``/vlm`` mode: a typed line (that isn't a task command) is
|
# ``/question`` mode: a typed line (that isn't a task
|
||||||
# a VQA question — run it synchronously and skip the action
|
# command) is a VQA question — run it synchronously and skip
|
||||||
# pipeline tick entirely.
|
# the action pipeline tick entirely.
|
||||||
if (
|
if (
|
||||||
runtime.state.get("task")
|
runtime.state.get("task")
|
||||||
and runtime.state.get("mode", "action") == "vlm"
|
and runtime.state.get("mode", "action") == "question"
|
||||||
and not lower.startswith(("task:", "rephrase:"))
|
and not lower.startswith(("task:", "rephrase:"))
|
||||||
):
|
):
|
||||||
runtime.state["log_lines"] = []
|
runtime.state["log_lines"] = []
|
||||||
|
|||||||
Reference in New Issue
Block a user