mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 11:09:59 +00:00
fix(annotate): mm_processor_kwargs in extra_body; inline file URLs as data URLs
Two fixes for video_url with transformers serve: - fps must be in extra_body.mm_processor_kwargs, not in the content block; otherwise the server discards it as unknown kwargs. - file:// URLs aren't fetched by transformers serve. Read the local mp4 and inline it as a base64 data:video/mp4 URL so the server sees the bytes directly. Both surface as std::bad_alloc on the server side when wrong, which is unhelpful but explains what we hit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -296,12 +296,19 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
|
|||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
outs: list[str] = []
|
outs: list[str] = []
|
||||||
for messages in batch:
|
for messages in batch:
|
||||||
api_messages = [_to_openai_message(m) for m in messages]
|
api_messages, mm_kwargs = _to_openai_messages(messages)
|
||||||
|
extra_body: dict[str, Any] = {}
|
||||||
|
if mm_kwargs:
|
||||||
|
extra_body["mm_processor_kwargs"] = {
|
||||||
|
**mm_kwargs,
|
||||||
|
"do_sample_frames": True,
|
||||||
|
}
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=config.model_id,
|
model=config.model_id,
|
||||||
messages=api_messages,
|
messages=api_messages,
|
||||||
max_tokens=max_tok,
|
max_tokens=max_tok,
|
||||||
temperature=temp,
|
temperature=temp,
|
||||||
|
extra_body=extra_body or None,
|
||||||
)
|
)
|
||||||
outs.append(response.choices[0].message.content or "")
|
outs.append(response.choices[0].message.content or "")
|
||||||
return outs
|
return outs
|
||||||
@@ -400,36 +407,62 @@ def _spawn_inference_server(config: VlmConfig) -> str:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
|
def _to_openai_messages(
|
||||||
"""Convert an internal message dict to OpenAI chat format.
|
messages: Sequence[dict[str, Any]],
|
||||||
|
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
|
||||||
|
"""Convert internal messages to OpenAI chat format.
|
||||||
|
|
||||||
Internal image/video blocks (using PIL.Image objects) become
|
Returns ``(api_messages, mm_kwargs)``. Multimodal-processor kwargs
|
||||||
OpenAI ``image_url``/``video_url`` items via base64 data URLs.
|
(``fps`` from ``video_url`` blocks) are extracted out so the caller
|
||||||
|
can pass them via ``extra_body.mm_processor_kwargs`` rather than
|
||||||
|
inside the content blocks (which transformers serve rejects).
|
||||||
|
|
||||||
|
File-URL video blocks are inlined as base64 data URLs.
|
||||||
"""
|
"""
|
||||||
content = message.get("content")
|
out_messages: list[dict[str, Any]] = []
|
||||||
if not isinstance(content, list):
|
mm_kwargs: dict[str, Any] = {}
|
||||||
return {"role": message["role"], "content": content}
|
for message in messages:
|
||||||
out_blocks: list[dict[str, Any]] = []
|
content = message.get("content")
|
||||||
for block in content:
|
if not isinstance(content, list):
|
||||||
block_type = block.get("type") if isinstance(block, dict) else None
|
out_messages.append({"role": message["role"], "content": content})
|
||||||
if block_type == "text":
|
continue
|
||||||
out_blocks.append({"type": "text", "text": block.get("text", "")})
|
out_blocks: list[dict[str, Any]] = []
|
||||||
elif block_type == "image":
|
for block in content:
|
||||||
out_blocks.append(
|
block_type = block.get("type") if isinstance(block, dict) else None
|
||||||
{"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
|
if block_type == "text":
|
||||||
)
|
out_blocks.append({"type": "text", "text": block.get("text", "")})
|
||||||
elif block_type == "video":
|
elif block_type == "image":
|
||||||
frames = block.get("video", [])
|
|
||||||
for img in frames:
|
|
||||||
out_blocks.append(
|
out_blocks.append(
|
||||||
{"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
|
{"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
|
||||||
)
|
)
|
||||||
elif block_type == "video_url":
|
elif block_type == "video":
|
||||||
# Pass through to the OpenAI-compatible server unchanged.
|
frames = block.get("video", [])
|
||||||
out_blocks.append({"type": "video_url", "video_url": block["video_url"]})
|
for img in frames:
|
||||||
else:
|
out_blocks.append(
|
||||||
out_blocks.append(block)
|
{"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
|
||||||
return {"role": message["role"], "content": out_blocks}
|
)
|
||||||
|
elif block_type == "video_url":
|
||||||
|
video_url = dict(block["video_url"])
|
||||||
|
url = video_url.get("url", "")
|
||||||
|
if url.startswith("file://"):
|
||||||
|
video_url["url"] = _file_to_data_url(url[len("file://") :])
|
||||||
|
out_blocks.append({"type": "video_url", "video_url": video_url})
|
||||||
|
fps = block.get("fps")
|
||||||
|
if fps is not None:
|
||||||
|
mm_kwargs["fps"] = fps
|
||||||
|
else:
|
||||||
|
out_blocks.append(block)
|
||||||
|
out_messages.append({"role": message["role"], "content": out_blocks})
|
||||||
|
return out_messages, mm_kwargs
|
||||||
|
|
||||||
|
|
||||||
|
def _file_to_data_url(path: str) -> str:
|
||||||
|
"""Read a local video file and return a base64 ``data:video/mp4`` URL."""
|
||||||
|
import base64 # noqa: PLC0415
|
||||||
|
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
b64 = base64.b64encode(f.read()).decode("ascii")
|
||||||
|
return f"data:video/mp4;base64,{b64}"
|
||||||
|
|
||||||
|
|
||||||
def _pil_to_data_url(image: Any) -> str:
|
def _pil_to_data_url(image: Any) -> str:
|
||||||
|
|||||||
Reference in New Issue
Block a user