From db9118f16fc92d8ab8d7dc673df7a897b2c2ca0e Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 5 May 2026 14:07:25 +0200 Subject: [PATCH] fix(smolvla2): reject gibberish high-level generations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memorised models can collapse to dominant-mode outputs (the JSON-token salad ``":":":":...`` from VQA training) when the prompt drifts even slightly from the training distribution. Without a guard, that gibberish lands in ``current_subtask`` / ``current_plan`` / ``current_memory``, which feeds the next tick's prompt and cascades into worse outputs. The user observed exactly this: a clean run followed by a tick that wrote ``" " "`` into plan and memory, then a slow recovery several ticks later. Add a ``_looks_like_gibberish`` heuristic (alpha density, repeating chars, JSON-prefix sniff) and apply it before mutating state in ``HighLevelSubtaskFwd`` / ``MemoryUpdateFwd`` / ``UserInterjectionFwd``. Bad generations are logged inline (``[info] subtask gen rejected (gibberish): "":":":..."``) so the user can see what was dropped, but the state stays at its last-known-good value (typically the dataset bootstrap) instead of being polluted. The VQA path is intentionally exempt — its training targets *are* JSON-shaped, so the heuristic would false-positive on them. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/smolvla2/inference/steps.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index d30bc9dc9..3db87bdb7 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -271,6 +271,9 @@ class HighLevelSubtaskFwd(InferenceStep): msg = _generate_with_policy( self.policy, ctx, observation=observation, state=state, label="subtask gen" ) + if msg and _looks_like_gibberish(msg): + push_log(state, f" [info] subtask gen rejected (gibberish): {msg[:60]!r}") + return None if msg: changed = set_if_changed(state, "current_subtask", msg, label="subtask") if changed: @@ -307,6 +310,9 @@ class MemoryUpdateFwd(InferenceStep): new_memory = _generate_with_policy( self.policy, ctx, observation=observation, state=state, label="memory gen" ) + if new_memory and _looks_like_gibberish(new_memory): + push_log(state, f" [info] memory gen rejected (gibberish): {new_memory[:60]!r}") + return None if new_memory: set_if_changed(state, "current_memory", new_memory, label="memory") return None @@ -340,11 +346,16 @@ class UserInterjectionFwd(InferenceStep): if not out: push_log(state, " [info] plan/say gen produced no text this tick") return None + if _looks_like_gibberish(out): + push_log(state, f" [info] plan/say gen rejected (gibberish): {out[:60]!r}") + return None # Heuristic split: model is trained to emit one assistant turn # carrying both plan text AND a `say` tool call. Look for a # "..." or "say(...)" marker; fall back to whole # text → plan, no speech. 
plan_text, speech_text = _split_plan_and_say(out) + if plan_text and _looks_like_gibberish(plan_text): + plan_text = "" if plan_text: set_if_changed(state, "current_plan", plan_text, label="plan") if speech_text: @@ -390,6 +401,9 @@ class AskVQAFwd(InferenceStep): answer = _generate_with_policy( self.policy, ctx, observation=observation, state=state, label="vqa gen" ) + # VQA answers are intentionally JSON-like during training, so + # ``_looks_like_gibberish`` would false-positive on them. Keep + # the answer as-is — the VQA panel line lets the user judge. if answer: push_log(state, f" vqa: {answer}") state["recent_vqa_query"] = None @@ -432,6 +446,38 @@ class DispatchToolCalls(InferenceStep): # --------------------------------------------------------------------------- +def _looks_like_gibberish(text: str) -> bool: + """Heuristically detect generation that's clearly off the rails. + + Memorised models can collapse to dominant-mode outputs (often the + JSON-token salad ``":":":":...`` from VQA training) when the prompt + drifts even slightly from training distribution. If we accept those + as new state, they pollute the next tick's prompt and cascade into + worse outputs. Reject anything that looks pathological: + + * empty / whitespace-only + * mostly punctuation (``"``, ``:``, ``,``) + * a single character repeated past the threshold + * starts with ``":"`` and contains no letters + + The thresholds are intentionally lenient — a real subtask like + ``"close the gripper"`` has ~70%+ alpha characters, while gibberish + like ``":":":"`` has ~0%. + """ + if not text or not text.strip(): + return True + stripped = text.strip() + alpha = sum(1 for c in stripped if c.isalpha()) + if alpha < max(3, len(stripped) // 8): + return True + if stripped.startswith('":') and stripped.count('"') > stripped.count(" "): + return True + # Single repeating char: e.g. 
``""""""`` + if len(set(stripped)) <= 2 and len(stripped) > 4: + return True + return False + + def _control_context_messages( state: dict[str, Any], *,