mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-15 08:39:49 +00:00
fix(annotate): use vllm.chat() API for multimodal prompts
vllm.generate() expects a string/TextPrompt; passing message dicts fails. vllm.chat() applies the chat template and extracts image/video blocks automatically, which is what we need for VL models. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -173,8 +173,10 @@ def _make_vllm_client(config: VlmConfig) -> VlmClient:
|
||||
# vllm releases (dict vs GuidedDecodingParams). The _GenericTextClient
|
||||
# wrapper already has a one-retry JSON-recovery path, so we skip it.
|
||||
params = SamplingParams(max_tokens=max_tok, temperature=temp)
|
||||
prompts = [_messages_to_prompt(m) for m in batch]
|
||||
outputs = llm.generate(prompts, params)
|
||||
# ``llm.chat`` handles chat-template application + multimodal input
|
||||
# extraction (image/video blocks) internally, which ``llm.generate``
|
||||
# does not.
|
||||
outputs = llm.chat([list(m) for m in batch], params)
|
||||
return [o.outputs[0].text for o in outputs]
|
||||
|
||||
return _GenericTextClient(_gen, config)
|
||||
|
||||
Reference in New Issue
Block a user