From c5df821a9685d1b3937436f8afe57858ed2fdeeb Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 16:20:50 +0200
Subject: [PATCH] fix(annotate): use vllm.chat() API for multimodal prompts

vllm.generate() expects a string/TextPrompt; passing message dicts
fails. vllm.chat() applies the chat template and extracts image/video
blocks automatically, which is what we need for VL models.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lerobot/annotations/steerable_pipeline/vlm_client.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index a7828c65d..ff0d07e5e 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -173,8 +173,10 @@ def _make_vllm_client(config: VlmConfig) -> VlmClient:
         # vllm releases (dict vs GuidedDecodingParams). The _GenericTextClient
         # wrapper already has a one-retry JSON-recovery path, so we skip it.
         params = SamplingParams(max_tokens=max_tok, temperature=temp)
-        prompts = [_messages_to_prompt(m) for m in batch]
-        outputs = llm.generate(prompts, params)
+        # ``llm.chat`` handles chat-template application + multimodal input
+        # extraction (image/video blocks) internally, which ``llm.generate``
+        # does not.
+        outputs = llm.chat([list(m) for m in batch], params)
         return [o.outputs[0].text for o in outputs]
 
     return _GenericTextClient(_gen, config)
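
Note for reviewers (illustrative sketch, not part of the patch): this is
the shape of input ``llm.chat`` accepts, assuming a hypothetical VL
checkpoint, image path, and prompt. The ``_encode_image`` helper and
every literal below are placeholders; only the ``llm.chat`` /
``SamplingParams`` calls reflect the actual vLLM API the patch relies on.

    import base64

    from vllm import LLM, SamplingParams

    llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct")  # placeholder VL model
    params = SamplingParams(max_tokens=256, temperature=0.0)

    def _encode_image(path: str) -> str:
        # Hypothetical helper: base64 data URL for a local image file.
        with open(path, "rb") as f:
            return "data:image/jpeg;base64," + base64.b64encode(f.read()).decode()

    # One conversation per batch element, with OpenAI-style content blocks.
    batch = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe what the robot arm is doing."},
                    {"type": "image_url", "image_url": {"url": _encode_image("frame_0.jpg")}},
                ],
            }
        ]
    ]

    # llm.chat applies the model's chat template and extracts the image
    # block itself; llm.generate would reject these dicts, since it only
    # accepts str/TextPrompt inputs.
    outputs = llm.chat(batch, params)
    print(outputs[0].outputs[0].text)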