From fc715db4a31291c610e70cd93524746d60b38ee8 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 12 May 2026 15:01:53 +0200 Subject: [PATCH] fix(smolvla2): coerce str content to list-of-blocks for chat template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SmolVLM's chat template (and many other multimodal templates) declares ``message['content']`` as a list of typed blocks and iterates it expecting dicts with a ``'type'`` field: {% for line in message['content'] %} {% if line['type'] == 'text' %}{{ line['text'] }} {% elif line['type'] == 'image' %}{{ '<image>' }} {% endif %} {% endfor %} When the caller passes ``content`` as a plain ``str`` (which we did throughout ``_msgs_for_subtask`` / ``_msgs_for_memory`` etc.), Jinja silently iterates the string character-by-character. ``'P'['type']`` returns nothing; neither branch fires; *no text tokens get emitted*. The model receives a prompt containing only role markers (``User:\nAssistant:``) and predictably continues by emitting ``Assistant:`` fragments — the gibberish ``subtask: Ass\n::`` on the runtime panel. Before calling ``apply_chat_template``, walk the messages and rewrite any string ``content`` into ``[{'type': 'text', 'text': content}]``. The template's text branch then fires correctly and the model sees the actual user/assistant text, not just structural tokens. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lerobot/policies/smolvla2/inference/steps.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index 3db87bdb7..12fb40c4c 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -171,6 +171,17 @@ def _build_text_batch(policy: Any, prompt_messages: list[dict[str, Any]]) -> dic tokenizer.pad_token = tokenizer.eos_token text_messages = [_strip_recipe_keys(m) for m in prompt_messages] + # SmolVLM's chat template iterates ``message['content']`` expecting + # a list of typed blocks (``[{type: 'text', text: ...}, ...]``). + # When ``content`` is a plain ``str`` it silently iterates characters, + # no branch matches, and *no content tokens are emitted* — the model + # receives only role markers and starts hallucinating ``Assistant:`` + # fragments. Coerce string content to the list-of-blocks form the + # template expects. + for _m in text_messages: + _c = _m.get("content") + if isinstance(_c, str): + _m["content"] = [{"type": "text", "text": _c}] encoded = tokenizer.apply_chat_template( text_messages, add_generation_prompt=True,