From f1a0a663cc2960acf01b89b6d2d167608d4d2d04 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Fri, 15 May 2026 13:52:26 +0200
Subject: [PATCH] fix(inference): gibberish detector catches long repetition
 collapse
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ``_looks_like_gibberish`` low-unique-token check was gated on
``len(stripped) < 80``, so an LM head that loops an n-gram for the
whole 256-token budget — "the arm the arm … the the the the" —
sailed straight through (``gibberish:0`` in the panel) and the
garbage subtask got accepted and fed to the action expert.

Added a check that does not depend on character length: at least 8
alphabetic tokens but a case-insensitive unique-token count
``<= max(3, len(tokens) // 10)`` ⇒ repetition collapse. Now the
runtime rejects the looped output and keeps the previous (real)
subtask instead of propagating nonsense.

This is a guard, not a cure — the underlying issue is the LM head
on the current checkpoint being undertrained / collapsed; the real
fix is to re-annotate with the short prompts and train longer.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/lerobot/policies/smolvla2/inference/steps.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index c9b84b167..a36ae26b5 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -683,12 +683,19 @@ def _looks_like_gibberish(text: str) -> bool:
     for marker in ("Assistant", "User", "Ass "):
         if marker in cleaned and len(cleaned.split()) < 4:
             return True
-    # Too few unique alphabetic tokens — model stuck on ``the`` or
-    # similar memorised single-token continuations.
     tokens = [t for t in cleaned.split() if any(c.isalpha() for c in t)]
     unique_alpha = {t.lower() for t in tokens}
+    # Short degenerate output — model stuck on ``the`` or a couple of
+    # memorised single-token continuations.
     if len(unique_alpha) < 3 and len(stripped) < 80:
         return True
+    # Long repetition collapse — the LM head loops an n-gram for the
+    # whole generation budget ("the arm the arm … the the the the").
+    # Length-independent: many tokens but a tiny unique ratio. The
+    # earlier ``< 80`` check missed these because the looped string
+    # blows well past 80 chars.
+    if len(tokens) >= 8 and len(unique_alpha) <= max(3, len(tokens) // 10):
+        return True
     return False