feat(smolvla2): action-dispatch counter + tighter gibberish filter

Real-robot run was unreadable for two reasons:

1. The panel surfaced ``queued actions: 0`` (always zero — dispatch
   pops faster than chunk_hz generates) and gave no signal that
   actions were actually reaching the robot. The only sign of life
   was the safety-clamp warning lines scrolling past.

2. The text head consistently collapses to ``the`` / ``Ass``
   fragments on real-camera input (memorisation wall). The old
   gibberish filter caught ``":":":"`` JSON salad but let
   single-token fragments through, and the ``[info] subtask gen
   produced no text this tick`` line flooded the panel every second.

Changes:

  * ``DispatchAction`` bumps ``state["actions_dispatched"]`` each
    tick; panel renders it next to queue depth. Operator can see
    the policy IS issuing actions even when text is broken.
  * ``_looks_like_gibberish`` now also rejects:
    - too few unique alphabetic tokens: fewer than three in an output
      shorter than 80 characters (``the``, ``the the``, ...)
    - chat-template marker leakage in outputs of fewer than four
      words (``Assistant:``, ``Ass\n::``)
    Together these catch the actual failure mode seen on real-robot
    frames.
  * Subtask gibberish rejections log only the first occurrence and
    every 30th after that, with a running count, so the panel stays
    legible. Plan/say gibberish rejections still log every time.
  * Empty completions no longer log at all (was every tick).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-12 15:22:36 +02:00
parent d0278ea093
commit cbfaf2c544
2 changed files with 50 additions and 15 deletions
@@ -147,6 +147,11 @@ class DispatchAction(InferenceStep):
action = queue.popleft() if hasattr(queue, "popleft") else queue.pop(0) action = queue.popleft() if hasattr(queue, "popleft") else queue.pop(0)
if self.robot_executor is not None: if self.robot_executor is not None:
self.robot_executor(action) self.robot_executor(action)
# Track lifetime dispatch count so the REPL panel can show
# whether the action loop is actually doing useful work, even
# while the text head produces gibberish (the typical real-
# robot failure mode for a memorised model).
state["actions_dispatched"] = state.get("actions_dispatched", 0) + 1
return None return None
@@ -285,15 +290,25 @@ class HighLevelSubtaskFwd(InferenceStep):
self.policy, ctx, observation=observation, state=state, label="subtask gen" self.policy, ctx, observation=observation, state=state, label="subtask gen"
) )
if msg and _looks_like_gibberish(msg): if msg and _looks_like_gibberish(msg):
push_log(state, f" [info] subtask gen rejected (gibberish): {msg[:60]!r}") # Bump a counter so the operator can see the model is
# struggling without spamming the log every tick. A first
# rejection still logs once so the failure is visible.
count = state.get("subtask_gibberish_count", 0) + 1
state["subtask_gibberish_count"] = count
if count == 1 or count % 30 == 0:
push_log(
state,
f" [info] subtask gen rejected (gibberish ×{count}): {msg[:60]!r}",
)
return None return None
if msg: if msg:
changed = set_if_changed(state, "current_subtask", msg, label="subtask") changed = set_if_changed(state, "current_subtask", msg, label="subtask")
if changed: if changed:
# Subtask change is a downstream trigger. # Subtask change is a downstream trigger.
state.setdefault("events_this_tick", []).append("subtask_change") state.setdefault("events_this_tick", []).append("subtask_change")
else: # Silently skip empty completions — common when the model
push_log(state, " [info] subtask gen produced no text this tick") # warms up or generates only EOS; logging it every tick at
# ctrl_hz is just noise.
return None return None
@@ -357,7 +372,9 @@ class UserInterjectionFwd(InferenceStep):
self.policy, ctx, observation=observation, state=state, label="plan/say gen" self.policy, ctx, observation=observation, state=state, label="plan/say gen"
) )
if not out: if not out:
push_log(state, " [info] plan/say gen produced no text this tick") # Don't log every empty completion — happens repeatedly on
# MPS during warm-up and floods the panel. The user can
# re-trigger by typing again.
return None return None
if _looks_like_gibberish(out): if _looks_like_gibberish(out):
push_log(state, f" [info] plan/say gen rejected (gibberish): {out[:60]!r}") push_log(state, f" [info] plan/say gen rejected (gibberish): {out[:60]!r}")
@@ -462,20 +479,22 @@ class DispatchToolCalls(InferenceStep):
def _looks_like_gibberish(text: str) -> bool: def _looks_like_gibberish(text: str) -> bool:
"""Heuristically detect generation that's clearly off the rails. """Heuristically detect generation that's clearly off the rails.
Memorised models can collapse to dominant-mode outputs (often the Memorised models can collapse to dominant-mode outputs when the
JSON-token salad ``":":":":...`` from VQA training) when the prompt prompt drifts even slightly from training distribution. Reject:
drifts even slightly from training distribution. If we accept those
as new state, they pollute the next tick's prompt and cascade into
worse outputs. Reject anything that looks pathological:
* empty / whitespace-only * empty / whitespace-only
* mostly punctuation (``"``, ``:``, ``,``) * too few alphabetic characters (mostly punctuation)
* a single character repeated past the threshold * a single character repeated past the threshold
* starts with ``":"`` and contains no letters * starts with ``":"`` and contains no letters
* too few unique tokens — e.g. ``"the"``, ``"the the the"``,
``"Ass\\n::\\nthe"`` (the collapse seen on real-robot frames
where the model emits one or two memorised tokens repeatedly)
* chat-template fragment leakage (``Assistant:``, ``User:``,
``Ass\\n``)
The thresholds are intentionally lenient — a real subtask like Real subtasks look like ``"close the gripper to grasp the blue
``"close the gripper"`` has ~70%+ alpha characters, while gibberish cube"`` — multiple unique alphabetic tokens, no role-marker
like ``":":":"`` has ~0%. fragments. Anything materially shorter than that is rejected.
""" """
if not text or not text.strip(): if not text or not text.strip():
return True return True
@@ -485,9 +504,22 @@ def _looks_like_gibberish(text: str) -> bool:
return True return True
if stripped.startswith('":') and stripped.count('"') > stripped.count(" "): if stripped.startswith('":') and stripped.count('"') > stripped.count(" "):
return True return True
# Single repeating char: e.g. ``""""""`` # Single repeating char: e.g. ``""""""``.
if len(set(stripped)) <= 2 and len(stripped) > 4: if len(set(stripped)) <= 2 and len(stripped) > 4:
return True return True
# Chat-template fragment leakage — the model emits ``Ass``,
# ``Assistant:``, ``User:``, often with extra newlines/colons.
# Reject if the cleaned text is mostly role-marker shards.
cleaned = stripped.replace("\n", " ").replace(":", " ")
for marker in ("Assistant", "User", "Ass "):
if marker in cleaned and len(cleaned.split()) < 4:
return True
# Too few unique alphabetic tokens — model stuck on ``the`` or
# similar memorised single-token continuations.
tokens = [t for t in cleaned.split() if any(c.isalpha() for c in t)]
unique_alpha = {t.lower() for t in tokens}
if len(unique_alpha) < 3 and len(stripped) < 80:
return True
return False return False
@@ -842,8 +842,11 @@ def _make_state_panel_renderer(
else 0 else 0
) )
pending = len(st.get("tool_calls_pending") or []) pending = len(st.get("tool_calls_pending") or [])
dispatched = int(st.get("actions_dispatched") or 0)
console.print( console.print(
f" [dim]queued actions: {queue_len} pending tool calls: {pending}[/]" f" [dim]queued actions: {queue_len} "
f"dispatched: {dispatched} "
f"pending tool calls: {pending}[/]"
) )
console.rule(style="cyan") console.rule(style="cyan")
if robot_lines: if robot_lines: