Mirror of https://github.com/huggingface/lerobot.git, synced 2026-05-15 08:39:49 +00:00
feat(annotate): openai-compatible backend for transformers/ktransformers serve

Adds a third backend that talks to any OpenAI-compatible server. This
unblocks Qwen3.6 (and other models) that work in transformers serve /
ktransformers but not in vllm 0.10.2's fallback path:

- launch the server out-of-process (transformers serve, vllm serve,
  ktransformers serve)
- point lerobot-annotate at it via --vlm.backend=openai
  --vlm.api_base=http://localhost:8000/v1 --vlm.model_id=...

Image blocks are converted to OpenAI image_url data URLs automatically;
video blocks are expanded into one image_url item per frame.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
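Concretely, a minimal sketch of that workflow, using only the flags the message names. The model id shown is the config default from the diff below; exact serve flags vary by server, and any dataset-specific annotate arguments are omitted:

    # 1) Launch any OpenAI-compatible server out-of-process, e.g.:
    vllm serve Qwen/Qwen3.6-27B-FP8
    # (or `transformers serve` / `ktransformers serve`, configured to host the model)

    # 2) Point lerobot-annotate at it:
    lerobot-annotate \
        --vlm.backend=openai \
        --vlm.api_base=http://localhost:8000/v1 \
        --vlm.model_id=Qwen/Qwen3.6-27B-FP8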
@@ -61,8 +61,16 @@ class VlmConfig:
     """Shared Qwen-VL client configuration."""
 
     backend: str = "vllm"
-    """One of ``vllm``, ``transformers``, or ``stub`` (tests only)."""
+    """One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
+
+    The ``openai`` backend talks to any OpenAI-compatible server: it works
+    with ``vllm serve``, ``transformers serve``, ``ktransformers serve``,
+    or hosted endpoints. Set ``api_base`` and (optionally) ``api_key``."""
     model_id: str = "Qwen/Qwen3.6-27B-FP8"
+    api_base: str = "http://localhost:8000/v1"
+    """Base URL for the ``openai`` backend."""
+    api_key: str = "EMPTY"
+    """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
     max_new_tokens: int = 512
     temperature: float = 0.2
     json_mode: bool = True
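For illustration, the same setup driven from Python rather than CLI overrides; a minimal sketch, assuming the hypothetical import path in the first line (the real module is whichever one this hunk patches):

    from lerobot.annotate.vlm import VlmConfig  # hypothetical import path

    config = VlmConfig(
        backend="openai",                     # route to the new backend
        model_id="Qwen/Qwen3.6-27B-FP8",      # must match the model the server hosts
        api_base="http://localhost:8000/v1",  # where `... serve` is listening
        api_key="EMPTY",                      # local servers accept any placeholder key
    )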
@@ -138,6 +138,8 @@ def make_vlm_client(config: VlmConfig) -> VlmClient:
         return _make_vllm_client(config)
     if config.backend == "transformers":
         return _make_transformers_client(config)
+    if config.backend == "openai":
+        return _make_openai_client(config)
     raise ValueError(f"Unknown VLM backend: {config.backend!r}")
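The dispatch is eager, so a mistyped backend fails at client construction rather than at the first request; a small sketch (same hypothetical import path as above):

    from lerobot.annotate.vlm import VlmConfig, make_vlm_client  # hypothetical path

    client = make_vlm_client(VlmConfig(backend="openai"))  # -> _make_openai_client

    try:
        make_vlm_client(VlmConfig(backend="openai-compat"))  # not a known backend
    except ValueError as err:
        print(err)  # Unknown VLM backend: 'openai-compat'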
@@ -251,6 +253,86 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
     return _GenericTextClient(_gen, config)
 
 
+def _make_openai_client(config: VlmConfig) -> VlmClient:
+    """Backend that talks to any OpenAI-compatible server.
+
+    Compatible with ``vllm serve``, ``transformers serve``,
+    ``ktransformers serve``, and hosted endpoints. The server is
+    expected to be already running and to host ``config.model_id``.
+
+    Image blocks ``{"type": "image", "image": <PIL.Image>}`` are
+    auto-converted to ``image_url`` data URLs. Video blocks
+    ``{"type": "video", "video": [<PIL>, ...]}`` are expanded into
+    one ``image_url`` item per frame.
+    """
+    try:
+        from openai import OpenAI  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise ImportError(
+            "openai package is required for backend='openai'. "
+            "Install with `pip install openai`."
+        ) from exc
+
+    client = OpenAI(base_url=config.api_base, api_key=config.api_key)
+
+    def _gen(
+        batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
+    ) -> list[str]:
+        # One chat-completion request per message list; the batch is
+        # processed sequentially.
+        outs: list[str] = []
+        for messages in batch:
+            api_messages = [_to_openai_message(m) for m in messages]
+            response = client.chat.completions.create(
+                model=config.model_id,
+                messages=api_messages,
+                max_tokens=max_tok,
+                temperature=temp,
+            )
+            outs.append(response.choices[0].message.content or "")
+        return outs
+
+    return _GenericTextClient(_gen, config)
+
+
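The client assumes, rather than verifies, that the server hosts ``config.model_id``; a preflight sketch that checks this up front using the same openai package (``models.list()`` maps to ``GET /v1/models`` on any OpenAI-compatible server):

    from openai import OpenAI

    probe = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    served = {m.id for m in probe.models.list()}
    if "Qwen/Qwen3.6-27B-FP8" not in served:
        raise RuntimeError(f"server does not host the configured model; it serves: {served}")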
+def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
+    """Convert an internal message dict to OpenAI chat format.
+
+    Internal image/video blocks (carrying PIL.Image objects) become
+    OpenAI ``image_url`` items via base64 data URLs; video blocks are
+    expanded into one ``image_url`` item per frame.
+    """
+    content = message.get("content")
+    if not isinstance(content, list):
+        # Plain-string content (or None) passes through unchanged.
+        return {"role": message["role"], "content": content}
+    out_blocks: list[dict[str, Any]] = []
+    for block in content:
+        block_type = block.get("type") if isinstance(block, dict) else None
+        if block_type == "text":
+            out_blocks.append({"type": "text", "text": block.get("text", "")})
+        elif block_type == "image":
+            out_blocks.append(
+                {"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
+            )
+        elif block_type == "video":
+            frames = block.get("video", [])
+            for img in frames:
+                out_blocks.append(
+                    {"type": "image_url", "image_url": {"url": _pil_to_data_url(img)}}
+                )
+        else:
+            # Unknown block types are forwarded untouched.
+            out_blocks.append(block)
+    return {"role": message["role"], "content": out_blocks}
+
+
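A worked example of the conversion above, runnable with Pillow installed and the two helpers in scope (data URLs abbreviated in the comment):

    from PIL import Image

    frame = Image.new("RGB", (8, 8))
    msg = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the clip."},
            {"type": "video", "video": [frame, frame]},  # two frames
        ],
    }
    out = _to_openai_message(msg)
    # out["content"] is now:
    #   [{"type": "text", "text": "Describe the clip."},
    #    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]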
+def _pil_to_data_url(image: Any) -> str:
+    """Encode a PIL.Image as a base64 data URL."""
+    import base64  # noqa: PLC0415
+    import io  # noqa: PLC0415
+
+    buf = io.BytesIO()
+    image.save(buf, format="PNG")
+    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+    return f"data:image/png;base64,{b64}"
+
+
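Since the payload is a losslessly compressed PNG, the encoding round-trips exactly; a small check, again assuming the helper is in scope:

    import base64
    import io

    from PIL import Image

    url = _pil_to_data_url(Image.new("RGB", (1, 1), "red"))
    assert url.startswith("data:image/png;base64,")

    # Decode the base64 payload back into an identical image.
    decoded = Image.open(io.BytesIO(base64.b64decode(url.split(",", 1)[1])))
    assert decoded.convert("RGB").getpixel((0, 0)) == (255, 0, 0)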
 def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
     """Pass-through hook used by the vllm backend.