From 9cbbcfb6a2e4b638c35de384fcb16fed28c4b908 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 5 May 2026 11:40:18 +0200 Subject: [PATCH] fix(smolvla2): tokenize lang prompt inline before select_action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LowLevelForward was handing the observation provider's output straight to ``policy.select_action``, but SmolVLA's ``_get_action_chunk`` indexes ``batch[OBS_LANGUAGE_TOKENS]`` and crashes with ``KeyError: 'observation.language.tokens'`` when the key isn't there. Our provider deliberately strips the dataset's language columns (the runtime drives messages itself), so nothing else was producing those tokens — the chunk path crashed on the very first tick after task was set. Build a low-level prompt from current runtime state inline (task / plan / memory as the user turn, current subtask appended as a continuation assistant turn when known), tokenize it with the same helper the high-level steps use, and merge ``lang_tokens`` / ``lang_masks`` into the observation before the call. Skip the step when no task is set yet, and swallow ``select_action`` exceptions at debug level so a missing observation feature doesn't kill the REPL. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/smolvla2/inference/steps.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index 0841359b0..8acb19d40 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -91,10 +91,35 @@ class LowLevelForward(InferenceStep): def run(self, state: dict[str, Any]) -> dict[str, Any] | None: if self.policy is None or self.observation_provider is None: return None + if not state.get("task"): + # No task yet → nothing useful to condition on. + return None observation = self.observation_provider() if observation is None: return None - action = self.policy.select_action(observation) + # SmolVLA's ``select_action`` expects the full preprocessed + # batch, including ``OBS_LANGUAGE_TOKENS`` / + # ``OBS_LANGUAGE_ATTENTION_MASK``. The observation provider + # only returns image / state features (the runtime drives + # messages itself), so build a low-level prompt from current + # runtime state and tokenize it inline. + ctx = _control_context_messages(state) + if state.get("current_subtask"): + ctx = ctx + [{"role": "assistant", "content": state["current_subtask"]}] + text_batch = _build_text_batch(self.policy, ctx) + from lerobot.utils.constants import ( # noqa: PLC0415 + OBS_LANGUAGE_ATTENTION_MASK, + OBS_LANGUAGE_TOKENS, + ) + + observation = dict(observation) + observation[OBS_LANGUAGE_TOKENS] = text_batch["lang_tokens"] + observation[OBS_LANGUAGE_ATTENTION_MASK] = text_batch["lang_masks"] + try: + action = self.policy.select_action(observation) + except Exception as exc: # noqa: BLE001 + logger.debug("select_action skipped: %s", exc) + return None # SmolVLA returns a single action; if the underlying policy # streams chunks, split per-step here. For v1 we just enqueue # the result.