Mirror of https://github.com/huggingface/lerobot.git, synced 2026-05-15 08:39:49 +00:00
feat(annotate): openai-compatible backend for transformers/ktransformers serve

Adds a third backend that talks to any OpenAI-compatible server. This
unblocks Qwen3.6 (and other models) that work in transformers serve /
ktransformers but not in vllm 0.10.2's fallback path:

- launch the server out-of-process (transformers serve, vllm serve,
  ktransformers serve)
- point lerobot-annotate at it via --vlm.backend=openai
  --vlm.api_base=http://localhost:8000/v1 --vlm.model_id=...

Image blocks are converted to OpenAI image_url data URLs automatically;
video blocks are expanded into one image_url item per frame.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
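Concretely, a minimal sketch of that workflow, using only the flags the message names. The model id shown is the config default from the diff below; exact serve flags vary by server, and any dataset-specific annotate arguments are omitted:

    # 1) Launch any OpenAI-compatible server out-of-process, e.g.:
    vllm serve Qwen/Qwen3.6-27B-FP8
    # (or `transformers serve` / `ktransformers serve`, configured to host the model)

    # 2) Point lerobot-annotate at it:
    lerobot-annotate \
        --vlm.backend=openai \
        --vlm.api_base=http://localhost:8000/v1 \
        --vlm.model_id=Qwen/Qwen3.6-27B-FP8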
@@ -61,8 +61,16 @@ class VlmConfig:
     """Shared Qwen-VL client configuration."""
 
     backend: str = "vllm"
-    """One of ``vllm``, ``transformers``, or ``stub`` (tests only)."""
+    """One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
+
+    The ``openai`` backend talks to any OpenAI-compatible server: it works
+    with ``vllm serve``, ``transformers serve``, ``ktransformers serve``,
+    or hosted endpoints. Set ``api_base`` and (optionally) ``api_key``."""
     model_id: str = "Qwen/Qwen3.6-27B-FP8"
+    api_base: str = "http://localhost:8000/v1"
+    """Base URL for the ``openai`` backend."""
+    api_key: str = "EMPTY"
+    """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
     max_new_tokens: int = 512
     temperature: float = 0.2
     json_mode: bool = True
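For illustration, the same setup driven from Python rather than CLI overrides; a minimal sketch, assuming the hypothetical import path in the first line (the real module is whichever one this hunk patches):

    from lerobot.annotate.vlm import VlmConfig  # hypothetical import path

    config = VlmConfig(
        backend="openai",                     # route to the new backend
        model_id="Qwen/Qwen3.6-27B-FP8",      # must match the model the server hosts
        api_base="http://localhost:8000/v1",  # where `... serve` is listening
        api_key="EMPTY",                      # local servers accept any placeholder key
    )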
@@ -138,6 +138,8 @@ def make_vlm_client(config: VlmConfig) -> VlmClient:
         return _make_vllm_client(config)
     if config.backend == "transformers":
         return _make_transformers_client(config)
+    if config.backend == "openai":
+        return _make_openai_client(config)
     raise ValueError(f"Unknown VLM backend: {config.backend!r}")
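The dispatch is eager, so a mistyped backend fails at client construction rather than at the first request; a small sketch (same hypothetical import path as above):

    from lerobot.annotate.vlm import VlmConfig, make_vlm_client  # hypothetical path

    client = make_vlm_client(VlmConfig(backend="openai"))  # -> _make_openai_client

    try:
        make_vlm_client(VlmConfig(backend="openai-compat"))  # not a known backend
    except ValueError as err:
        print(err)  # Unknown VLM backend: 'openai-compat'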
@@ -251,6 +253,86 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
     return _GenericTextClient(_gen, config)
 
 
+def _make_openai_client(config: VlmConfig) -> VlmClient:
+    """Backend that talks to any OpenAI-compatible server.
+
+    Compatible with ``vllm serve``, ``transformers serve``,
+    ``ktransformers serve``, and hosted endpoints. The server is
+    expected to be already running and to host ``config.model_id``.
+
+    Image blocks ``{"type": "image", "image": <PIL.Image>}`` are
+    auto-converted to ``image_url`` data URLs. Video blocks
+    ``{"type": "video", "video": [<PIL>, ...]}`` are expanded into
+    one ``image_url`` item per frame.
+    """
+    try:
+        from openai import OpenAI  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise ImportError(
+            "openai package is required for backend='openai'. "
+            "Install with `pip install openai`."
+        ) from exc
+
+    client = OpenAI(base_url=config.api_base, api_key=config.api_key)
+
+    def _gen(
+        batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
+    ) -> list[str]:
+        # One chat-completion request per message list; the batch is
+        # processed sequentially.
+        outs: list[str] = []
+        for messages in batch:
+            api_messages = [_to_openai_message(m) for m in messages]
+            response = client.chat.completions.create(
+                model=config.model_id,
+                messages=api_messages,
+                max_tokens=max_tok,
+                temperature=temp,
+            )
+            outs.append(response.choices[0].message.content or "")
+        return outs
+
+    return _GenericTextClient(_gen, config)
+
+
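The client assumes, rather than verifies, that the server hosts ``config.model_id``; a preflight sketch that checks this up front using the same openai package (``models.list()`` maps to ``GET /v1/models`` on any OpenAI-compatible server):

    from openai import OpenAI

    probe = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    served = {m.id for m in probe.models.list()}
    if "Qwen/Qwen3.6-27B-FP8" not in served:
        raise RuntimeError(f"server does not host the configured model; it serves: {served}")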
+def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
+    """Convert an internal message dict to OpenAI chat format.
+
+    Internal image/video blocks (carrying PIL.Image objects) become
+    OpenAI ``image_url`` items via base64 data URLs; video blocks are
+    expanded into one ``image_url`` item per frame.
+    """
+    content = message.get("content")
+    if not isinstance(content, list):
+        # Plain-string content (or None) passes through unchanged.
+        return {"role": message["role"], "content": content}
+    out_blocks: list[dict[str, Any]] = []
+    for block in content:
+        block_type = block.get("type") if isinstance(block, dict) else None
+        if block_type == "text":
+            out_blocks.append({"type": "text", "text": block.get("text", "")})
+        elif block_type == "image":
+            out_blocks.append(
+                {"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
+            )
+        elif block_type == "video":
+            frames = block.get("video", [])
+            for img in frames:
+                out_blocks.append(
+                    {"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
+                )
+        else:
+            # Unknown block types are forwarded untouched.
+            out_blocks.append(block)
+    return {"role": message["role"], "content": out_blocks}
+
+
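A worked example of the conversion above, runnable with Pillow installed and the two helpers in scope (data URLs abbreviated in the comment):

    from PIL import Image

    frame = Image.new("RGB", (8, 8))
    msg = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the clip."},
            {"type": "video", "video": [frame, frame]},  # two frames
        ],
    }
    out = _to_openai_message(msg)
    # out["content"] is now:
    #   [{"type": "text", "text": "Describe the clip."},
    #    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]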
+def _pil_to_data_url(image: Any) -> str:
+    """Encode a PIL.Image as a base64 data URL."""
+    import base64  # noqa: PLC0415
+    import io  # noqa: PLC0415
+
+    buf = io.BytesIO()
+    image.save(buf, format="PNG")
+    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+    return f"data:image/png;base64,{b64}"
+
+
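Since the payload is a losslessly compressed PNG, the encoding round-trips exactly; a small check, again assuming the helper is in scope:

    import base64
    import io

    from PIL import Image

    url = _pil_to_data_url(Image.new("RGB", (1, 1), "red"))
    assert url.startswith("data:image/png;base64,")

    # Decode the base64 payload back into an identical image.
    decoded = Image.open(io.BytesIO(base64.b64decode(url.split(",", 1)[1])))
    assert decoded.convert("RGB").getpixel((0, 0)) == (255, 0, 0)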
 def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
     """Pass-through hook used by the vllm backend.