mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-24 21:19:53 +00:00
feat(smolvla2): action-dispatch counter + tighter gibberish filter
Real-robot run was unreadable for two reasons:
1. The panel surfaced ``queued actions: 0`` (always zero — dispatch
pops faster than chunk_hz generates) and gave no signal that
actions were actually reaching the robot. The only sign of life
was the safety-clamp warning lines scrolling past.
2. The text head consistently collapses to ``the`` / ``Ass``
fragments on real-camera input (memorisation wall). The old
gibberish filter caught ``":":":"`` JSON salad but let
single-token fragments through, and the ``[info] subtask gen
produced no text this tick`` line flooded the panel every second.
Changes:
* ``DispatchAction`` bumps ``state["actions_dispatched"]`` each time
it dispatches an action; the panel renders this lifetime total next
to queue depth, so the operator can see the policy IS issuing
actions even when text is broken.
* ``_looks_like_gibberish`` now also rejects:
- too few unique alphabetic tokens (``the``, ``the the``, ...)
- chat-template marker leakage (``Assistant:``, ``Ass\n::``)
Together these catch the actual failure mode on real-robot frames.
* Subtask-gen gibberish rejections log only the first occurrence +
every 30th after that, with a running count, so the panel stays
legible. (``plan/say gen`` rejections still log every time.)
* Empty completions no longer log at all (was every tick).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -147,6 +147,11 @@ class DispatchAction(InferenceStep):
|
||||
action = queue.popleft() if hasattr(queue, "popleft") else queue.pop(0)
|
||||
if self.robot_executor is not None:
|
||||
self.robot_executor(action)
|
||||
# Track lifetime dispatch count so the REPL panel can show
|
||||
# whether the action loop is actually doing useful work, even
|
||||
# while the text head produces gibberish (the typical real-
|
||||
# robot failure mode for a memorised model).
|
||||
state["actions_dispatched"] = state.get("actions_dispatched", 0) + 1
|
||||
return None
|
||||
|
||||
|
||||
@@ -285,15 +290,25 @@ class HighLevelSubtaskFwd(InferenceStep):
|
||||
self.policy, ctx, observation=observation, state=state, label="subtask gen"
|
||||
)
|
||||
if msg and _looks_like_gibberish(msg):
|
||||
push_log(state, f" [info] subtask gen rejected (gibberish): {msg[:60]!r}")
|
||||
# Bump a counter so the operator can see the model is
|
||||
# struggling without spamming the log every tick. A first
|
||||
# rejection still logs once so the failure is visible.
|
||||
count = state.get("subtask_gibberish_count", 0) + 1
|
||||
state["subtask_gibberish_count"] = count
|
||||
if count == 1 or count % 30 == 0:
|
||||
push_log(
|
||||
state,
|
||||
f" [info] subtask gen rejected (gibberish ×{count}): {msg[:60]!r}",
|
||||
)
|
||||
return None
|
||||
if msg:
|
||||
changed = set_if_changed(state, "current_subtask", msg, label="subtask")
|
||||
if changed:
|
||||
# Subtask change is a downstream trigger.
|
||||
state.setdefault("events_this_tick", []).append("subtask_change")
|
||||
else:
|
||||
push_log(state, " [info] subtask gen produced no text this tick")
|
||||
# Silently skip empty completions — common when the model
|
||||
# warms up or generates only EOS; logging it every tick at
|
||||
# ctrl_hz is just noise.
|
||||
return None
|
||||
|
||||
|
||||
@@ -357,7 +372,9 @@ class UserInterjectionFwd(InferenceStep):
|
||||
self.policy, ctx, observation=observation, state=state, label="plan/say gen"
|
||||
)
|
||||
if not out:
|
||||
push_log(state, " [info] plan/say gen produced no text this tick")
|
||||
# Don't log every empty completion — happens repeatedly on
|
||||
# MPS during warm-up and floods the panel. The user can
|
||||
# re-trigger by typing again.
|
||||
return None
|
||||
if _looks_like_gibberish(out):
|
||||
push_log(state, f" [info] plan/say gen rejected (gibberish): {out[:60]!r}")
|
||||
@@ -462,20 +479,22 @@ class DispatchToolCalls(InferenceStep):
|
||||
def _looks_like_gibberish(text: str) -> bool:
|
||||
"""Heuristically detect generation that's clearly off the rails.
|
||||
|
||||
Memorised models can collapse to dominant-mode outputs (often the
|
||||
JSON-token salad ``":":":":...`` from VQA training) when the prompt
|
||||
drifts even slightly from training distribution. If we accept those
|
||||
as new state, they pollute the next tick's prompt and cascade into
|
||||
worse outputs. Reject anything that looks pathological:
|
||||
Memorised models can collapse to dominant-mode outputs when the
|
||||
prompt drifts even slightly from training distribution. Reject:
|
||||
|
||||
* empty / whitespace-only
|
||||
* mostly punctuation (``"``, ``:``, ``,``)
|
||||
* too few alphabetic characters (mostly punctuation)
|
||||
* a single character repeated past the threshold
|
||||
* starts with ``":"`` and contains no letters
|
||||
* too few unique tokens — e.g. ``"the"``, ``"the the the"``,
|
||||
``"Ass\\n::\\nthe"`` (the collapse seen on real-robot frames
|
||||
where the model emits one or two memorised tokens repeatedly)
|
||||
* chat-template fragment leakage (``Assistant:``, ``User:``,
|
||||
``Ass\\n``)
|
||||
|
||||
The thresholds are intentionally lenient — a real subtask like
|
||||
``"close the gripper"`` has ~70%+ alpha characters, while gibberish
|
||||
like ``":":":"`` has ~0%.
|
||||
Real subtasks look like ``"close the gripper to grasp the blue
|
||||
cube"`` — multiple unique alphabetic tokens, no role-marker
|
||||
fragments. Anything materially shorter than that is rejected.
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return True
|
||||
@@ -485,9 +504,22 @@ def _looks_like_gibberish(text: str) -> bool:
|
||||
return True
|
||||
if stripped.startswith('":') and stripped.count('"') > stripped.count(" "):
|
||||
return True
|
||||
# Single repeating char: e.g. ``""""""``
|
||||
# Single repeating char: e.g. ``""""""``.
|
||||
if len(set(stripped)) <= 2 and len(stripped) > 4:
|
||||
return True
|
||||
# Chat-template fragment leakage — the model emits ``Ass``,
|
||||
# ``Assistant:``, ``User:``, often with extra newlines/colons.
|
||||
# Reject if the cleaned text is mostly role-marker shards.
|
||||
cleaned = stripped.replace("\n", " ").replace(":", " ")
|
||||
for marker in ("Assistant", "User", "Ass "):
|
||||
if marker in cleaned and len(cleaned.split()) < 4:
|
||||
return True
|
||||
# Too few unique alphabetic tokens — model stuck on ``the`` or
|
||||
# similar memorised single-token continuations.
|
||||
tokens = [t for t in cleaned.split() if any(c.isalpha() for c in t)]
|
||||
unique_alpha = {t.lower() for t in tokens}
|
||||
if len(unique_alpha) < 3 and len(stripped) < 80:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
||||
@@ -842,8 +842,11 @@ def _make_state_panel_renderer(
|
||||
else 0
|
||||
)
|
||||
pending = len(st.get("tool_calls_pending") or [])
|
||||
dispatched = int(st.get("actions_dispatched") or 0)
|
||||
console.print(
|
||||
f" [dim]queued actions: {queue_len} pending tool calls: {pending}[/]"
|
||||
f" [dim]queued actions: {queue_len} "
|
||||
f"dispatched: {dispatched} "
|
||||
f"pending tool calls: {pending}[/]"
|
||||
)
|
||||
console.rule(style="cyan")
|
||||
if robot_lines:
|
||||
|
||||
Reference in New Issue
Block a user