From 7e91482e3a447dea53572e9c44d9bf1f9ec6844f Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Wed, 29 Apr 2026 00:53:08 +0200
Subject: [PATCH] refactor(annotate): drop HF Inference Providers code path

Default backend is now a local OpenAI-compatible server (vllm /
transformers) which auto_serve spawns. Removes the
use_hf_inference_providers config flag and the router.huggingface.co
routing branch.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../annotations/steerable_pipeline/config.py | 16 +----
 .../steerable_pipeline/vlm_client.py         | 59 ++++++-------------
 2 files changed, 21 insertions(+), 54 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 356eed290..81e1a6a13 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -82,23 +82,13 @@ class VlmConfig:
     backend: str = "openai"
     """One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
 
-    Default ``openai`` paired with ``use_hf_inference_providers=True``
-    routes requests through HF Inference Providers — no local GPU
-    needed. Switch to ``vllm`` / ``transformers`` for in-process
-    inference."""
-    model_id: str = "Qwen/Qwen3-VL-30B-A3B-Instruct:novita"
+    Default ``openai`` talks to a local OpenAI-compatible server (vllm /
+    transformers) which the CLI auto-spawns when ``auto_serve=True``."""
+    model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct"
     api_base: str = "http://localhost:8000/v1"
     """Base URL for the ``openai`` backend."""
     api_key: str = "EMPTY"
     """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
-    use_hf_inference_providers: bool = True
-    """Route requests through https://router.huggingface.co/v1 using your
-    ``HF_TOKEN`` env var as the API key. Default ``True`` — no local GPU
-    needed. The CLI flips ``auto_serve`` off automatically when this is
-    set. Use ``model_id`` of the form
-    ``Qwen/Qwen3-VL-30B-A3B-Instruct:novita`` to pin a specific provider,
-    or omit ``:provider`` to let HF route. Set ``False`` to fall back to
-    a local server (vllm serve / transformers serve / external)."""
     auto_serve: bool = True
     """When True with ``backend=openai``, the CLI probes ``api_base``
     first; if no server answers, it spawns one (default:
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 0cb002d2c..1f1f83037 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -336,48 +336,25 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
     auto_serve = config.auto_serve
 
     api_bases: list[str] = [api_base]
-    if config.use_hf_inference_providers:
-        api_base = "https://router.huggingface.co/v1"
-        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") or ""
-        if not token:
-            try:
-                from huggingface_hub import get_token  # noqa: PLC0415
-
-                token = get_token() or ""
-            except Exception:  # noqa: BLE001
-                token = ""
-        if not token:
-            raise RuntimeError(
-                "use_hf_inference_providers=True needs an HF token. Either set "
-                "HF_TOKEN in the environment, or run `huggingface-cli login` once."
+    print(
+        f"[lerobot-annotate] backend=openai model={config.model_id} "
+        f"api_base={api_base} auto_serve={auto_serve}",
+        flush=True,
+    )
+    if auto_serve:
+        if config.parallel_servers > 1:
+            print(
+                f"[lerobot-annotate] spawning {config.parallel_servers} parallel servers",
+                flush=True,
             )
-        api_key = token
-        auto_serve = False
-        print(
-            f"[lerobot-annotate] HF Inference Providers: routing model={config.model_id} "
-            f"via {api_base}",
-            flush=True,
-        )
-    else:
-        print(
-            f"[lerobot-annotate] backend=openai model={config.model_id} "
-            f"api_base={api_base} auto_serve={auto_serve}",
-            flush=True,
-        )
-        if auto_serve:
-            if config.parallel_servers > 1:
-                print(
-                    f"[lerobot-annotate] spawning {config.parallel_servers} parallel servers",
-                    flush=True,
-                )
-            api_bases = _spawn_parallel_inference_servers(config)
-        elif _server_is_up(api_base):
-            print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
-        else:
-            print("[lerobot-annotate] no server reachable; spawning one", flush=True)
-            api_base = _spawn_inference_server(config)
-            api_bases = [api_base]
-        print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
+        api_bases = _spawn_parallel_inference_servers(config)
+    elif _server_is_up(api_base):
+        print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
+    else:
+        print("[lerobot-annotate] no server reachable; spawning one", flush=True)
+        api_base = _spawn_inference_server(config)
+        api_bases = [api_base]
+    print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
 
     clients = [OpenAI(base_url=base, api_key=api_key) for base in api_bases]
     client = clients[0]
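Usage note (commentary, not part of the applied patch): after this change
the default VlmConfig() exercises the auto_serve path end to end. Below is
a minimal sketch of the resulting flow, assuming the module paths from the
diff headers, assuming VlmConfig is a keyword-constructible dataclass, and
calling the private _make_openai_client helper directly for illustration;
the external host in the second call is hypothetical:

    from lerobot.annotations.steerable_pipeline.config import VlmConfig
    from lerobot.annotations.steerable_pipeline.vlm_client import _make_openai_client

    # Post-patch defaults: backend="openai", model_id="Qwen/Qwen2.5-VL-7B-Instruct",
    # api_base="http://localhost:8000/v1", auto_serve=True. Per the log lines
    # above, the helper spawns a local vllm/transformers server (or several,
    # when parallel_servers > 1) and returns once it reports the server ready.
    client = _make_openai_client(VlmConfig())

    # With auto_serve=False the helper instead probes api_base and reuses a
    # server that is already up (hypothetical external host):
    client = _make_openai_client(
        VlmConfig(auto_serve=False, api_base="http://my-gpu-box:8000/v1")
    )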