feat(annotate): openai-compatible backend for transformers/ktransformers serve

Adds a third backend that talks to any OpenAI-compatible server. This unblocks Qwen3.6 (and other models) that work in transformers serve / ktransformers but not in vllm 0.10.2's fallback path:

- launch the server out-of-process (transformers serve, vllm serve, ktransformers serve)
- point lerobot-annotate at it via --vlm.backend=openai --vlm.api_base=http://localhost:8000/v1 --vlm.model_id=...

Image blocks are converted to OpenAI image_url data URLs automatically; video blocks are sent as one image_url data URL per frame.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 19:49:49 +00:00 · 2026-04-28 16:24:11 +02:00
parent 0782bbcd38
commit 91dedcad1e
2 changed files with 91 additions and 1 deletions
@@ -61,8 +61,16 @@ class VlmConfig:
    """Shared Qwen-VL client configuration."""
    backend: str = "vllm"
-    """One of ``vllm``, ``transformers``, or ``stub`` (tests only)."""
+    """One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
    The ``openai`` backend talks to any OpenAI-compatible server — works
    with ``vllm serve``, ``transformers serve``, ``ktransformers serve``,
    or hosted endpoints. Set ``api_base`` and (optionally) ``api_key``."""
    model_id: str = "Qwen/Qwen3.6-27B-FP8"
    api_base: str = "http://localhost:8000/v1"
    """Base URL for the ``openai`` backend."""
    api_key: str = "EMPTY"
    """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
    max_new_tokens: int = 512
    temperature: float = 0.2
    json_mode: bool = True
@@ -138,6 +138,8 @@ def make_vlm_client(config: VlmConfig) -> VlmClient:
        return _make_vllm_client(config)
    if config.backend == "transformers":
        return _make_transformers_client(config)
    if config.backend == "openai":
        return _make_openai_client(config)
    raise ValueError(f"Unknown VLM backend: {config.backend!r}")
@@ -251,6 +253,86 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
    return _GenericTextClient(_gen, config)
 def _make_openai_client(config: VlmConfig) -> VlmClient:
    """Build a client that sends chat requests to an OpenAI-compatible server.

    The server (``vllm serve``, ``transformers serve``,
    ``ktransformers serve`` or a hosted endpoint) must already be running
    at ``config.api_base`` and serving ``config.model_id``; nothing is
    launched from here.

    Every message is run through ``_to_openai_message`` before being sent,
    so image blocks and video frames reach the server as ``image_url``
    data-URL items.

    Args:
        config: Supplies ``api_base``, ``api_key`` and ``model_id``.

    Returns:
        A ``_GenericTextClient`` wrapping the request function.

    Raises:
        ImportError: If the ``openai`` package is not installed.
    """
    try:
        from openai import OpenAI  # type: ignore[import-not-found]
    except ImportError as exc:
        raise ImportError(
            "openai package is required for backend='openai'. "
            "Install with `pip install openai`."
        ) from exc

    api = OpenAI(base_url=config.api_base, api_key=config.api_key)

    def _complete(
        conversation: Sequence[dict[str, Any]], max_tok: int, temp: float
    ) -> str:
        """Run one chat completion; a reply without content yields ``""``."""
        reply = api.chat.completions.create(
            model=config.model_id,
            messages=[_to_openai_message(turn) for turn in conversation],
            max_tokens=max_tok,
            temperature=temp,
        )
        return reply.choices[0].message.content or ""

    def _gen(
        batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
    ) -> list[str]:
        # One request per conversation, issued sequentially in batch order.
        return [_complete(conversation, max_tok, temp) for conversation in batch]

    return _GenericTextClient(_gen, config)
 def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
    """Convert an internal message dict to OpenAI chat format.
    Internal image/video blocks (using PIL.Image objects) become
    OpenAI ``image_url``/``video_url`` items via base64 data URLs.
    """
    content = message.get("content")
    if not isinstance(content, list):
        return {"role": message["role"], "content": content}
    out_blocks: list[dict[str, Any]] = []
    for block in content:
        block_type = block.get("type") if isinstance(block, dict) else None
        if block_type == "text":
            out_blocks.append({"type": "text", "text": block.get("text", "")})
        elif block_type == "image":
            out_blocks.append(
                {"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
            )
        elif block_type == "video":
            frames = block.get("video", [])
            for img in frames:
                out_blocks.append(
                    {"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
                )
        else:
            out_blocks.append(block)
    return {"role": message["role"], "content": out_blocks}
 def _pil_to_data_url(image: Any) -> str:
    """Encode a PIL.Image as a base64 data URL."""
    import base64  # noqa: PLC0415
    import io  # noqa: PLC0415
    buf = io.BytesIO()
    image.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:image/png;base64,{b64}"
 def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
    """Pass-through hook used by the vllm backend.