From a27972125bffe6b1c237919d9793cdf9815ae456 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 16:24:11 +0200
Subject: [PATCH] feat(annotate): openai-compatible backend for
 transformers/ktransformers serve

Adds a third backend that talks to any OpenAI-compatible server. This
unblocks Qwen3.6 (and other models) that work in transformers serve /
ktransformers serve but not in vllm 0.10.2's fallback path:

- launch the server out-of-process (transformers serve, vllm serve,
  ktransformers serve)
- point lerobot-annotate at it via --vlm.backend=openai
  --vlm.api_base=http://localhost:8000/v1 --vlm.model_id=...

Image and video blocks are converted to OpenAI image_url data URLs
automatically; video blocks are expanded into one image_url item per
frame.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
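
Notes (this section is dropped by git am; for reviewers): a minimal
smoke test for the new message conversion once the patch is applied.
This is a sketch, not part of the patch; it assumes Pillow is
installed and pokes the private helper _to_openai_message directly,
with a toy 8x8 image standing in for a real camera frame.

    from PIL import Image

    from lerobot.annotations.steerable_pipeline.vlm_client import _to_openai_message

    msg = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the scene."},
            {"type": "image", "image": Image.new("RGB", (8, 8), "red")},
        ],
    }
    converted = _to_openai_message(msg)
    # Text blocks pass through; image blocks become base64 PNG data URLs.
    assert converted["content"][0] == {"type": "text", "text": "Describe the scene."}
    assert converted["content"][1]["image_url"]["url"].startswith("data:image/png;base64,")
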
 .../annotations/steerable_pipeline/config.py  | 10 ++-
 .../steerable_pipeline/vlm_client.py          | 82 +++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 347d75225..1d6786d3f 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -61,8 +61,16 @@ class VlmConfig:
     """Shared Qwen-VL client configuration."""
 
     backend: str = "vllm"
-    """One of ``vllm``, ``transformers``, or ``stub`` (tests only)."""
+    """One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
+
+    The ``openai`` backend talks to any OpenAI-compatible server; it works
+    with ``vllm serve``, ``transformers serve``, ``ktransformers serve``,
+    or hosted endpoints. Set ``api_base`` and (optionally) ``api_key``."""
     model_id: str = "Qwen/Qwen3.6-27B-FP8"
+    api_base: str = "http://localhost:8000/v1"
+    """Base URL for the ``openai`` backend."""
+    api_key: str = "EMPTY"
+    """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
     max_new_tokens: int = 512
     temperature: float = 0.2
     json_mode: bool = True
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index ff0d07e5e..e3f443c60 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -138,6 +138,8 @@ def make_vlm_client(config: VlmConfig) -> VlmClient:
         return _make_vllm_client(config)
     if config.backend == "transformers":
         return _make_transformers_client(config)
+    if config.backend == "openai":
+        return _make_openai_client(config)
     raise ValueError(f"Unknown VLM backend: {config.backend!r}")
 
 
@@ -251,6 +253,86 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
     return _GenericTextClient(_gen, config)
 
 
+def _make_openai_client(config: VlmConfig) -> VlmClient:
+    """Backend that talks to any OpenAI-compatible server.
+
+    Compatible with ``vllm serve``, ``transformers serve``,
+    ``ktransformers serve``, and hosted endpoints. The server must
+    already be running and hosting ``config.model_id``.
+
+    Image blocks ``{"type": "image", "image": <PIL.Image>}`` are
+    auto-converted to ``image_url`` data URLs. Video blocks
+    ``{"type": "video", "video": [...]}`` are expanded into one
+    ``image_url`` item per frame.
+    """
+    try:
+        from openai import OpenAI  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise ImportError(
+            "openai package is required for backend='openai'. "
+            "Install with `pip install openai`."
+        ) from exc
+
+    client = OpenAI(base_url=config.api_base, api_key=config.api_key)
+
+    def _gen(
+        batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
+    ) -> list[str]:
+        outs: list[str] = []
+        for messages in batch:
+            api_messages = [_to_openai_message(m) for m in messages]
+            response = client.chat.completions.create(
+                model=config.model_id,
+                messages=api_messages,
+                max_tokens=max_tok,
+                temperature=temp,
+            )
+            outs.append(response.choices[0].message.content or "")
+        return outs
+
+    return _GenericTextClient(_gen, config)
+
+
+def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
+    """Convert an internal message dict to OpenAI chat format.
+
+    Internal image/video blocks (holding PIL.Image objects) become
+    OpenAI ``image_url`` items via base64 data URLs.
+    """
+    content = message.get("content")
+    if not isinstance(content, list):
+        return {"role": message["role"], "content": content}
+    out_blocks: list[dict[str, Any]] = []
+    for block in content:
+        block_type = block.get("type") if isinstance(block, dict) else None
+        if block_type == "text":
+            out_blocks.append({"type": "text", "text": block.get("text", "")})
+        elif block_type == "image":
+            out_blocks.append(
+                {"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
+            )
+        elif block_type == "video":
+            frames = block.get("video", [])
+            for img in frames:
+                out_blocks.append(
+                    {"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
+                )
+        else:
+            out_blocks.append(block)
+    return {"role": message["role"], "content": out_blocks}
+
+
+def _pil_to_data_url(image: Any) -> str:
+    """Encode a PIL.Image as a base64 PNG data URL."""
+    import base64  # noqa: PLC0415
+    import io  # noqa: PLC0415
+
+    buf = io.BytesIO()
+    image.save(buf, format="PNG")
+    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+    return f"data:image/png;base64,{b64}"
+
+
 def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
     """Pass-through hook used by the vllm backend.
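
Reviewer note, not part of the patch: a minimal end-to-end sketch. It
assumes an OpenAI-compatible server is already listening on
localhost:8000 (started separately via vllm serve, transformers serve,
or ktransformers serve) and that VlmConfig accepts its fields as
keyword arguments, as its dataclass-style defaults suggest.

    from lerobot.annotations.steerable_pipeline.config import VlmConfig
    from lerobot.annotations.steerable_pipeline.vlm_client import make_vlm_client

    config = VlmConfig(
        backend="openai",
        model_id="Qwen/Qwen3.6-27B-FP8",
        api_base="http://localhost:8000/v1",
        api_key="EMPTY",  # local servers typically ignore the key
    )
    client = make_vlm_client(config)  # dispatches to _make_openai_client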