mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-21 19:49:49 +00:00
feat(annotate): openai-compatible backend for transformers/ktransformers serve
Adds a third backend that talks to any OpenAI-compatible server. This unblocks Qwen3.6 (and other models) that work in transformers serve / ktransformers but not in vllm 0.10.2's fallback path: - launch the server out-of-process (transformers serve, vllm serve, ktransformers serve) - point lerobot-annotate at it via --vlm.backend=openai --vlm.api_base=http://localhost:8000/v1 --vlm.model_id=... Image blocks are converted to OpenAI image_url data URLs automatically; video blocks are expanded into one image_url data URL per frame. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -61,8 +61,16 @@ class VlmConfig:
|
|||||||
"""Shared Qwen-VL client configuration."""
|
"""Shared Qwen-VL client configuration."""
|
||||||
|
|
||||||
backend: str = "vllm"
|
backend: str = "vllm"
|
||||||
"""One of ``vllm``, ``transformers``, or ``stub`` (tests only)."""
|
"""One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
|
||||||
|
|
||||||
|
The ``openai`` backend talks to any OpenAI-compatible server — works
|
||||||
|
with ``vllm serve``, ``transformers serve``, ``ktransformers serve``,
|
||||||
|
or hosted endpoints. Set ``api_base`` and (optionally) ``api_key``."""
|
||||||
model_id: str = "Qwen/Qwen3.6-27B-FP8"
|
model_id: str = "Qwen/Qwen3.6-27B-FP8"
|
||||||
|
api_base: str = "http://localhost:8000/v1"
|
||||||
|
"""Base URL for the ``openai`` backend."""
|
||||||
|
api_key: str = "EMPTY"
|
||||||
|
"""API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
|
||||||
max_new_tokens: int = 512
|
max_new_tokens: int = 512
|
||||||
temperature: float = 0.2
|
temperature: float = 0.2
|
||||||
json_mode: bool = True
|
json_mode: bool = True
|
||||||
|
|||||||
@@ -138,6 +138,8 @@ def make_vlm_client(config: VlmConfig) -> VlmClient:
|
|||||||
return _make_vllm_client(config)
|
return _make_vllm_client(config)
|
||||||
if config.backend == "transformers":
|
if config.backend == "transformers":
|
||||||
return _make_transformers_client(config)
|
return _make_transformers_client(config)
|
||||||
|
if config.backend == "openai":
|
||||||
|
return _make_openai_client(config)
|
||||||
raise ValueError(f"Unknown VLM backend: {config.backend!r}")
|
raise ValueError(f"Unknown VLM backend: {config.backend!r}")
|
||||||
|
|
||||||
|
|
||||||
@@ -251,6 +253,86 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
|
|||||||
return _GenericTextClient(_gen, config)
|
return _GenericTextClient(_gen, config)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_openai_client(config: VlmConfig) -> VlmClient:
    """Build a client that talks to any OpenAI-compatible chat server.

    Compatible with ``vllm serve``, ``transformers serve``,
    ``ktransformers serve``, and hosted endpoints. The server is
    expected to be already running at ``config.api_base`` and to host
    ``config.model_id``; this function does not launch it.

    Every message is passed through ``_to_openai_message`` before being
    sent. Image blocks ``{"type":"image", "image":<PIL.Image>}`` become
    ``image_url`` data-URL items, and video blocks
    ``{"type":"video", "video":[<PIL>...]}`` are expanded into one
    ``image_url`` item per frame (no ``video_url`` item is emitted).

    Args:
        config: Supplies ``api_base``, ``api_key`` and ``model_id`` for
            the remote endpoint.

    Returns:
        A ``_GenericTextClient`` wrapping the request function.

    Raises:
        ImportError: If the ``openai`` package is not installed.
    """
    try:
        from openai import OpenAI  # type: ignore[import-not-found]
    except ImportError as exc:
        raise ImportError(
            "openai package is required for backend='openai'. "
            "Install with `pip install openai`."
        ) from exc

    client = OpenAI(base_url=config.api_base, api_key=config.api_key)

    def _gen(
        batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
    ) -> list[str]:
        """Send one chat-completion request per conversation in ``batch``."""
        outs: list[str] = []
        # Conversations are sent sequentially, one request each.
        for messages in batch:
            api_messages = [_to_openai_message(m) for m in messages]
            response = client.chat.completions.create(
                model=config.model_id,
                messages=api_messages,
                max_tokens=max_tok,
                temperature=temp,
            )
            # An empty ``choices`` list is treated like missing content so a
            # single degenerate response yields "" instead of an IndexError.
            choices = response.choices
            text = choices[0].message.content if choices else None
            outs.append(text or "")
        return outs

    return _GenericTextClient(_gen, config)
|
||||||
|
|
||||||
|
|
||||||
|
def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
    """Convert an internal message dict to OpenAI chat format.

    Non-list ``content`` (a plain string, or a missing key) is passed
    through unchanged. List content is converted block by block:

    * ``{"type": "text"}`` blocks keep only their ``text`` (default ``""``).
    * ``{"type": "image", "image": <PIL.Image>}`` blocks become an
      ``image_url`` item holding a base64 data URL.
    * ``{"type": "video", "video": [<PIL.Image>, ...]}`` blocks are
      expanded into one ``image_url`` item per frame; no ``video_url``
      item is produced. A missing or ``None`` frame list yields nothing.
    * Any other block, including non-dict entries, is forwarded as-is.

    Args:
        message: Internal message with a ``role`` key and optional
            ``content``.

    Returns:
        A new dict with ``role`` and the converted ``content``.

    Raises:
        KeyError: If ``message`` has no ``role``, or an image block has
            no ``image`` entry.
    """
    role = message["role"]
    content = message.get("content")
    if not isinstance(content, list):
        return {"role": role, "content": content}

    out_blocks: list[dict[str, Any]] = []
    for block in content:
        block_type = block.get("type") if isinstance(block, dict) else None
        if block_type == "text":
            out_blocks.append({"type": "text", "text": block.get("text", "")})
        elif block_type == "image":
            out_blocks.append(
                {"type": "image_url", "image_url": {"url": _pil_to_data_url(block["image"])}}
            )
        elif block_type == "video":
            # ``or ()`` also covers an explicit ``"video": None``, which the
            # ``.get`` default alone would not catch.
            for frame in block.get("video") or ():
                out_blocks.append(
                    {"type": "image_url", "image_url": {"url": _pil_to_data_url(frame)}}
                )
        else:
            out_blocks.append(block)
    return {"role": role, "content": out_blocks}
|
||||||
|
|
||||||
|
|
||||||
|
def _pil_to_data_url(image: Any) -> str:
    """Serialize ``image`` as PNG and wrap the bytes in a base64 data URL.

    ``image`` is expected to be a ``PIL.Image``; the only thing used here
    is its ``save(fp, format=...)`` method. The result has the form
    ``data:image/png;base64,<payload>``.
    """
    from base64 import b64encode  # noqa: PLC0415
    from io import BytesIO  # noqa: PLC0415

    with BytesIO() as stream:
        image.save(stream, format="PNG")
        png_bytes = stream.getvalue()
    payload = b64encode(png_bytes).decode("ascii")
    return "data:image/png;base64," + payload
|
||||||
|
|
||||||
|
|
||||||
def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
|
def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
|
||||||
"""Pass-through hook used by the vllm backend.
|
"""Pass-through hook used by the vllm backend.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user