From d5559a9445e3740f9a558b2728c94f001bbcd6fa Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 19:28:13 +0200
Subject: [PATCH] feat(annotate): one-flag HF Inference Providers backend

Setting --vlm.use_hf_inference_providers=true routes requests through
https://router.huggingface.co/v1 using HF_TOKEN as the API key, and
disables auto_serve so no local server is spawned. Combine with a
provider-pinned model id like 'Qwen/Qwen3-VL-30B-A3B-Instruct:novita',
or pass a plain model id to let HF route.
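
For reviewers: with the flag set, the backend reduces to a plain
OpenAI-client call against the HF router. A minimal equivalent sketch,
assuming the openai Python SDK is installed and HF_TOKEN is exported;
the chat payload is illustrative, while the base URL, env var, and
model-id format come from this patch:

    import os

    from openai import OpenAI

    # Same endpoint and credential the patched _make_openai_client uses.
    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"],
    )

    # Provider-pinned model id; drop ':novita' to let HF route.
    response = client.chat.completions.create(
        model="Qwen/Qwen3-VL-30B-A3B-Instruct:novita",
        messages=[{"role": "user", "content": "Describe the scene."}],
    )
    print(response.choices[0].message.content)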

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../annotations/steerable_pipeline/config.py |  6 +++
 .../steerable_pipeline/vlm_client.py         | 45 +++++++++++++------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index dcabc8345..6f5e64c6d 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -83,6 +83,12 @@ class VlmConfig:
     """Base URL for the ``openai`` backend."""
     api_key: str = "EMPTY"
     """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
+    use_hf_inference_providers: bool = False
+    """When True, route requests through https://router.huggingface.co/v1
+    using your ``HF_TOKEN`` env var as the API key. The CLI flips
+    ``auto_serve`` off automatically — no local server is spawned. Use
+    ``model_id`` of the form ``Qwen/Qwen3-VL-30B-A3B-Instruct:novita`` to
+    pin a specific provider, or omit ``:provider`` to let HF route."""
     auto_serve: bool = True
     """When True with ``backend=openai``, the CLI probes ``api_base``
     first; if no server answers, it spawns one (default:

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 1c58363b2..8fe40b785 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -277,20 +277,39 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
         ) from exc
 
     api_base = config.api_base
-    print(
-        f"[lerobot-annotate] backend=openai model={config.model_id} "
-        f"api_base={api_base} auto_serve={config.auto_serve}",
-        flush=True,
-    )
-    if config.auto_serve:
-        if _server_is_up(api_base):
-            print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
-        else:
-            print("[lerobot-annotate] no server reachable; spawning one", flush=True)
-            api_base = _spawn_inference_server(config)
-            print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
+    api_key = config.api_key
+    auto_serve = config.auto_serve
 
-    client = OpenAI(base_url=api_base, api_key=config.api_key)
+    if config.use_hf_inference_providers:
+        api_base = "https://router.huggingface.co/v1"
+        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") or ""
+        if not token:
+            raise RuntimeError(
+                "use_hf_inference_providers=True requires HF_TOKEN (or "
+                "HUGGINGFACE_API_KEY) in the environment."
+            )
+        api_key = token
+        auto_serve = False
+        print(
+            f"[lerobot-annotate] HF Inference Providers: routing model={config.model_id} "
+            f"via {api_base}",
+            flush=True,
+        )
+    else:
+        print(
+            f"[lerobot-annotate] backend=openai model={config.model_id} "
+            f"api_base={api_base} auto_serve={auto_serve}",
+            flush=True,
+        )
+        if auto_serve:
+            if _server_is_up(api_base):
+                print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
+            else:
+                print("[lerobot-annotate] no server reachable; spawning one", flush=True)
+                api_base = _spawn_inference_server(config)
+                print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
+
+    client = OpenAI(base_url=api_base, api_key=api_key)
 
     # ``mm_processor_kwargs`` is a vllm-specific extra; transformers serve
     # rejects it with HTTP 422. Send it only when explicitly opted in via