feat(annotate): openai-compatible backend for transformers/ktransformers serve

Adds a third backend that talks to any OpenAI-compatible server. This
unblocks Qwen3.6 (and other models) that work in transformers serve /
ktransformers but not in vllm 0.10.2's fallback path:

- launch the server out-of-process (transformers serve, vllm serve,
  ktransformers serve)
- point lerobot-annotate at it via --vlm.backend=openai
  --vlm.api_base=http://localhost:8000/v1 --vlm.model_id=...

Image blocks are converted to OpenAI image_url data URLs automatically;
video blocks are flattened into one image_url item per frame.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-04-28 16:24:11 +02:00
parent 70bdec72ef
commit a27972125b
2 changed files with 91 additions and 1 deletions
@@ -61,8 +61,16 @@ class VlmConfig:
"""Shared Qwen-VL client configuration."""
backend: str = "vllm"
"""One of ``vllm``, ``transformers``, or ``stub`` (tests only)."""
"""One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
The ``openai`` backend talks to any OpenAI-compatible server — works
with ``vllm serve``, ``transformers serve``, ``ktransformers serve``,
or hosted endpoints. Set ``api_base`` and (optionally) ``api_key``."""
model_id: str = "Qwen/Qwen3.6-27B-FP8"
api_base: str = "http://localhost:8000/v1"
"""Base URL for the ``openai`` backend."""
api_key: str = "EMPTY"
"""API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
max_new_tokens: int = 512
temperature: float = 0.2
json_mode: bool = True
@@ -138,6 +138,8 @@ def make_vlm_client(config: VlmConfig) -> VlmClient:
return _make_vllm_client(config)
if config.backend == "transformers":
return _make_transformers_client(config)
if config.backend == "openai":
return _make_openai_client(config)
raise ValueError(f"Unknown VLM backend: {config.backend!r}")
@@ -251,6 +253,86 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
return _GenericTextClient(_gen, config)
def _make_openai_client(config: VlmConfig) -> VlmClient:
    """Backend that talks to any OpenAI-compatible server.

    Compatible with ``vllm serve``, ``transformers serve``,
    ``ktransformers serve``, and hosted endpoints. The server is
    expected to be already running and to host ``config.model_id``.

    Image blocks ``{"type": "image", "image": <PIL.Image>}`` are
    auto-converted to ``image_url`` data URLs. Video blocks
    ``{"type": "video", "video": [<PIL>, ...]}`` are flattened into
    one ``image_url`` item per frame (see ``_to_openai_message``).

    Raises:
        ImportError: if the ``openai`` package is not installed.
    """
    try:
        from openai import OpenAI  # type: ignore[import-not-found]
    except ImportError as exc:
        raise ImportError(
            "openai package is required for backend='openai'. "
            "Install with `pip install openai`."
        ) from exc

    client = OpenAI(base_url=config.api_base, api_key=config.api_key)

    # Request kwargs shared by every call. ``json_mode`` was previously
    # ignored by this backend; honour it via the OpenAI ``response_format``
    # parameter (supported by vllm serve and hosted endpoints —
    # NOTE(review): confirm the target server accepts json_object mode).
    extra_kwargs: dict[str, Any] = {}
    if config.json_mode:
        extra_kwargs["response_format"] = {"type": "json_object"}

    def _gen(
        batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
    ) -> list[str]:
        """Generate one completion per message list, sequentially."""
        outs: list[str] = []
        for messages in batch:
            api_messages = [_to_openai_message(m) for m in messages]
            response = client.chat.completions.create(
                model=config.model_id,
                messages=api_messages,
                max_tokens=max_tok,
                temperature=temp,
                **extra_kwargs,
            )
            # ``content`` may be None (e.g. refusal/tool responses);
            # normalise to the empty string for downstream parsing.
            outs.append(response.choices[0].message.content or "")
        return outs

    return _GenericTextClient(_gen, config)
def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
"""Convert an internal message dict to OpenAI chat format.
Internal image/video blocks (using PIL.Image objects) become
OpenAI ``image_url``/``video_url`` items via base64 data URLs.
"""
content = message.get("content")
if not isinstance(content, list):
return {"role": message["role"], "content": content}
out_blocks: list[dict[str, Any]] = []
for block in content:
block_type = block.get("type") if isinstance(block, dict) else None
if block_type == "text":
out_blocks.append({"type": "text", "text": block.get("text", "")})
elif block_type == "image":
out_blocks.append(
{"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
)
elif block_type == "video":
frames = block.get("video", [])
for img in frames:
out_blocks.append(
{"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
)
else:
out_blocks.append(block)
return {"role": message["role"], "content": out_blocks}
def _pil_to_data_url(image: Any) -> str:
"""Encode a PIL.Image as a base64 data URL."""
import base64 # noqa: PLC0415
import io # noqa: PLC0415
buf = io.BytesIO()
image.save(buf, format="PNG")
b64 = base64.b64encode(buf.getvalue()).decode("ascii")
return f"data:image/png;base64,{b64}"
def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
"""Pass-through hook used by the vllm backend.