From d5559a9445e3740f9a558b2728c94f001bbcd6fa Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 19:28:13 +0200
Subject: [PATCH] feat(annotate): one-flag HF Inference Providers backend

Setting --vlm.use_hf_inference_providers=true routes requests through
https://router.huggingface.co/v1 using HF_TOKEN as the API key, and
disables auto_serve so no local server is spawned. Combine with a
provider-pinned model id like 'Qwen/Qwen3-VL-30B-A3B-Instruct:novita',
or pass a plain model id to let HF route.
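
For reviewers: with the flag set, the backend reduces to a plain
OpenAI-client call against the HF router. A minimal equivalent sketch,
assuming the openai Python SDK is installed and HF_TOKEN is exported;
the chat payload is illustrative, while the base URL, env var, and
model-id format come from this patch:

    import os

    from openai import OpenAI

    # Same endpoint and credential the patched _make_openai_client uses.
    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"],
    )

    # Provider-pinned model id; drop ':novita' to let HF route.
    response = client.chat.completions.create(
        model="Qwen/Qwen3-VL-30B-A3B-Instruct:novita",
        messages=[{"role": "user", "content": "Describe the scene."}],
    )
    print(response.choices[0].message.content)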

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../annotations/steerable_pipeline/config.py |  6 +++
 .../steerable_pipeline/vlm_client.py         | 45 +++++++++++++------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index dcabc8345..6f5e64c6d 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -83,6 +83,12 @@ class VlmConfig:
     """Base URL for the ``openai`` backend."""
     api_key: str = "EMPTY"
     """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
+    use_hf_inference_providers: bool = False
+    """When True, route requests through https://router.huggingface.co/v1
+    using your ``HF_TOKEN`` env var as the API key. The CLI flips
+    ``auto_serve`` off automatically — no local server is spawned. Use
+    ``model_id`` of the form ``Qwen/Qwen3-VL-30B-A3B-Instruct:novita`` to
+    pin a specific provider, or omit ``:provider`` to let HF route."""
     auto_serve: bool = True
     """When True with ``backend=openai``, the CLI probes ``api_base``
     first; if no server answers, it spawns one (default:

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 1c58363b2..8fe40b785 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -277,20 +277,39 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
         ) from exc
 
     api_base = config.api_base
-    print(
-        f"[lerobot-annotate] backend=openai model={config.model_id} "
-        f"api_base={api_base} auto_serve={config.auto_serve}",
-        flush=True,
-    )
-    if config.auto_serve:
-        if _server_is_up(api_base):
-            print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
-        else:
-            print("[lerobot-annotate] no server reachable; spawning one", flush=True)
-            api_base = _spawn_inference_server(config)
-            print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
+    api_key = config.api_key
+    auto_serve = config.auto_serve
 
-    client = OpenAI(base_url=api_base, api_key=config.api_key)
+    if config.use_hf_inference_providers:
+        api_base = "https://router.huggingface.co/v1"
+        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") or ""
+        if not token:
+            raise RuntimeError(
+                "use_hf_inference_providers=True requires HF_TOKEN (or "
+                "HUGGINGFACE_API_KEY) in the environment."
+            )
+        api_key = token
+        auto_serve = False
+        print(
+            f"[lerobot-annotate] HF Inference Providers: routing model={config.model_id} "
+            f"via {api_base}",
+            flush=True,
+        )
+    else:
+        print(
+            f"[lerobot-annotate] backend=openai model={config.model_id} "
+            f"api_base={api_base} auto_serve={auto_serve}",
+            flush=True,
+        )
+        if auto_serve:
+            if _server_is_up(api_base):
+                print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
+            else:
+                print("[lerobot-annotate] no server reachable; spawning one", flush=True)
+                api_base = _spawn_inference_server(config)
+                print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
+
+    client = OpenAI(base_url=api_base, api_key=api_key)
 
     # ``mm_processor_kwargs`` is a vllm-specific extra; transformers serve
     # rejects it with HTTP 422. Send it only when explicitly opted in via