From 7e91482e3a447dea53572e9c44d9bf1f9ec6844f Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Wed, 29 Apr 2026 00:53:08 +0200
Subject: [PATCH] refactor(annotate): drop HF Inference Providers code path

Default backend is now a local OpenAI-compatible server (vllm /
transformers) which auto_serve spawns. Removes the
use_hf_inference_providers config flag and the router.huggingface.co
routing branch.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../annotations/steerable_pipeline/config.py | 16 +----
 .../steerable_pipeline/vlm_client.py         | 59 ++++++-------------
 2 files changed, 21 insertions(+), 54 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 356eed290..81e1a6a13 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -82,23 +82,13 @@ class VlmConfig:
     backend: str = "openai"
     """One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests only).
 
-    Default ``openai`` paired with ``use_hf_inference_providers=True``
-    routes requests through HF Inference Providers — no local GPU
-    needed. Switch to ``vllm`` / ``transformers`` for in-process
-    inference."""
-    model_id: str = "Qwen/Qwen3-VL-30B-A3B-Instruct:novita"
+    Default ``openai`` talks to a local OpenAI-compatible server (vllm /
+    transformers) which the CLI auto-spawns when ``auto_serve=True``."""
+    model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct"
     api_base: str = "http://localhost:8000/v1"
     """Base URL for the ``openai`` backend."""
     api_key: str = "EMPTY"
     """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
-    use_hf_inference_providers: bool = True
-    """Route requests through https://router.huggingface.co/v1 using your
-    ``HF_TOKEN`` env var as the API key. Default ``True`` — no local GPU
-    needed. The CLI flips ``auto_serve`` off automatically when this is
-    set. Use ``model_id`` of the form
-    ``Qwen/Qwen3-VL-30B-A3B-Instruct:novita`` to pin a specific provider,
-    or omit ``:provider`` to let HF route. Set ``False`` to fall back to
-    a local server (vllm serve / transformers serve / external)."""
     auto_serve: bool = True
     """When True with ``backend=openai``, the CLI probes ``api_base``
     first; if no server answers, it spawns one (default:
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 0cb002d2c..1f1f83037 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -336,48 +336,25 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
     auto_serve = config.auto_serve
 
     api_bases: list[str] = [api_base]
-    if config.use_hf_inference_providers:
-        api_base = "https://router.huggingface.co/v1"
-        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") or ""
-        if not token:
-            try:
-                from huggingface_hub import get_token  # noqa: PLC0415
-
-                token = get_token() or ""
-            except Exception:  # noqa: BLE001
-                token = ""
-        if not token:
-            raise RuntimeError(
-                "use_hf_inference_providers=True needs an HF token. Either set "
-                "HF_TOKEN in the environment, or run `huggingface-cli login` once."
+    print(
+        f"[lerobot-annotate] backend=openai model={config.model_id} "
+        f"api_base={api_base} auto_serve={auto_serve}",
+        flush=True,
+    )
+    if auto_serve:
+        if config.parallel_servers > 1:
+            print(
+                f"[lerobot-annotate] spawning {config.parallel_servers} parallel servers",
+                flush=True,
             )
-        api_key = token
-        auto_serve = False
-        print(
-            f"[lerobot-annotate] HF Inference Providers: routing model={config.model_id} "
-            f"via {api_base}",
-            flush=True,
-        )
-    else:
-        print(
-            f"[lerobot-annotate] backend=openai model={config.model_id} "
-            f"api_base={api_base} auto_serve={auto_serve}",
-            flush=True,
-        )
-        if auto_serve:
-            if config.parallel_servers > 1:
-                print(
-                    f"[lerobot-annotate] spawning {config.parallel_servers} parallel servers",
-                    flush=True,
-                )
-            api_bases = _spawn_parallel_inference_servers(config)
-        elif _server_is_up(api_base):
-            print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
-        else:
-            print("[lerobot-annotate] no server reachable; spawning one", flush=True)
-            api_base = _spawn_inference_server(config)
-            api_bases = [api_base]
-        print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
+        api_bases = _spawn_parallel_inference_servers(config)
+    elif _server_is_up(api_base):
+        print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
+    else:
+        print("[lerobot-annotate] no server reachable; spawning one", flush=True)
+        api_base = _spawn_inference_server(config)
+        api_bases = [api_base]
+    print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
 
     clients = [OpenAI(base_url=base, api_key=api_key) for base in api_bases]
     client = clients[0]
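Usage note (commentary, not part of the applied patch): after this change
the default VlmConfig() exercises the auto_serve path end to end. Below is
a minimal sketch of the resulting flow, assuming the module paths from the
diff headers, assuming VlmConfig is a keyword-constructible dataclass, and
calling the private _make_openai_client helper directly for illustration;
the external host in the second call is hypothetical:

    from lerobot.annotations.steerable_pipeline.config import VlmConfig
    from lerobot.annotations.steerable_pipeline.vlm_client import _make_openai_client

    # Post-patch defaults: backend="openai", model_id="Qwen/Qwen2.5-VL-7B-Instruct",
    # api_base="http://localhost:8000/v1", auto_serve=True. Per the log lines
    # above, the helper spawns a local vllm/transformers server (or several,
    # when parallel_servers > 1) and returns once it reports the server ready.
    client = _make_openai_client(VlmConfig())

    # With auto_serve=False the helper instead probes api_base and reuses a
    # server that is already up (hypothetical external host):
    client = _make_openai_client(
        VlmConfig(auto_serve=False, api_base="http://my-gpu-box:8000/v1")
    )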