fix(annotate): mm_processor_kwargs in extra_body; inline file URLs as data URLs

Two fixes for video_url with transformers serve:
- fps must be passed in extra_body.mm_processor_kwargs, not in the
  content block; otherwise the server discards it as an unknown kwarg.
- file:// URLs aren't fetched by transformers serve. Read the local mp4
  and inline it as a base64 data:video/mp4 URL so the server sees the
  bytes directly.

When either is wrong, the server fails with std::bad_alloc, which is an
unhelpful error but explains the failure we hit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-04-28 18:53:43 +02:00
parent 43d3ba1d4e
commit 8807e0b41e
@@ -296,12 +296,19 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
) -> list[str]: ) -> list[str]:
outs: list[str] = [] outs: list[str] = []
for messages in batch: for messages in batch:
api_messages = [_to_openai_message(m) for m in messages] api_messages, mm_kwargs = _to_openai_messages(messages)
extra_body: dict[str, Any] = {}
if mm_kwargs:
extra_body["mm_processor_kwargs"] = {
**mm_kwargs,
"do_sample_frames": True,
}
response = client.chat.completions.create( response = client.chat.completions.create(
model=config.model_id, model=config.model_id,
messages=api_messages, messages=api_messages,
max_tokens=max_tok, max_tokens=max_tok,
temperature=temp, temperature=temp,
extra_body=extra_body or None,
) )
outs.append(response.choices[0].message.content or "") outs.append(response.choices[0].message.content or "")
return outs return outs
@@ -400,15 +407,25 @@ def _spawn_inference_server(config: VlmConfig) -> str:
) )
def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]: def _to_openai_messages(
"""Convert an internal message dict to OpenAI chat format. messages: Sequence[dict[str, Any]],
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
"""Convert internal messages to OpenAI chat format.
Internal image/video blocks (using PIL.Image objects) become Returns ``(api_messages, mm_kwargs)``. Multimodal-processor kwargs
OpenAI ``image_url``/``video_url`` items via base64 data URLs. (``fps`` from ``video_url`` blocks) are extracted out so the caller
can pass them via ``extra_body.mm_processor_kwargs`` rather than
inside the content blocks (which transformers serve rejects).
File-URL video blocks are inlined as base64 data URLs.
""" """
out_messages: list[dict[str, Any]] = []
mm_kwargs: dict[str, Any] = {}
for message in messages:
content = message.get("content") content = message.get("content")
if not isinstance(content, list): if not isinstance(content, list):
return {"role": message["role"], "content": content} out_messages.append({"role": message["role"], "content": content})
continue
out_blocks: list[dict[str, Any]] = [] out_blocks: list[dict[str, Any]] = []
for block in content: for block in content:
block_type = block.get("type") if isinstance(block, dict) else None block_type = block.get("type") if isinstance(block, dict) else None
@@ -425,11 +442,27 @@ def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
{"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}} {"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
) )
elif block_type == "video_url": elif block_type == "video_url":
# Pass through to the OpenAI-compatible server unchanged. video_url = dict(block["video_url"])
out_blocks.append({"type": "video_url", "video_url": block["video_url"]}) url = video_url.get("url", "")
if url.startswith("file://"):
video_url["url"] = _file_to_data_url(url[len("file://") :])
out_blocks.append({"type": "video_url", "video_url": video_url})
fps = block.get("fps")
if fps is not None:
mm_kwargs["fps"] = fps
else: else:
out_blocks.append(block) out_blocks.append(block)
return {"role": message["role"], "content": out_blocks} out_messages.append({"role": message["role"], "content": out_blocks})
return out_messages, mm_kwargs
def _file_to_data_url(path: str) -> str:
    """Encode the file at ``path`` as a base64 ``data:video/mp4`` URL.

    The whole file is read into memory in binary mode and base64-encoded.
    The MIME type in the returned URL is always ``video/mp4``; the file's
    extension and contents are not inspected.
    """
    import base64  # noqa: PLC0415

    with open(path, "rb") as video_file:
        raw_bytes = video_file.read()
    encoded = base64.b64encode(raw_bytes).decode("ascii")
    return "data:video/mp4;base64," + encoded
def _pil_to_data_url(image: Any) -> str: def _pil_to_data_url(image: Any) -> str: