From 994ad880eedd837d7cfffcd890e923f15f7bf7f3 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 23:47:52 +0200
Subject: [PATCH] fix(annotate): probe /v1/models for spawn-helper readiness

vllm with --uvicorn-log-level warning suppresses the "Uvicorn running"
banner that the readiness watcher waited for, so the spawn helper hung
forever even after the API was live. Add an HTTP probe in parallel with
the log watcher and broaden the log markers to include vllm's own
"Starting vLLM API server" / "Available routes are" lines.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../steerable_pipeline/vlm_client.py          | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index e55e9bae4..0cb002d2c 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -455,7 +455,16 @@ def _spawn_parallel_inference_servers(config: VlmConfig) -> list[str]:
     api_bases: list[str] = []
     procs: list[subprocess.Popen] = []
     ready_events: list[threading.Event] = []
-    ready_markers = ("Uvicorn running", "Application startup complete")
+    # Multiple readiness signals: uvicorn's own banner is suppressed at
+    # ``--uvicorn-log-level warning``, so we also accept vllm's own
+    # "Starting vLLM API server" line and the route-listing line. The
+    # HTTP probe below is the ultimate fallback.
+    ready_markers = (
+        "Uvicorn running",
+        "Application startup complete",
+        "Starting vLLM API server",
+        "Available routes are",
+    )
     # Single lock for all server-stream threads so multibyte chars from
     # different servers don't interleave and tear UTF-8 sequences.
     print_lock = threading.Lock()
@@ -506,6 +515,16 @@
 
         threading.Thread(target=_stream, args=(i, proc, ready), daemon=True).start()
 
+        def _probe(idx: int, base: str, ev: threading.Event, p: subprocess.Popen) -> None:
+            while not ev.is_set() and p.poll() is None:
+                if _server_is_up(base):
+                    print(f"[server-{idx}] ready (http probe)", flush=True)
+                    ev.set()
+                    return
+                time.sleep(2)
+
+        threading.Thread(target=_probe, args=(i, api_base, ready, proc), daemon=True).start()
+
     def _shutdown() -> None:
         for i, p in enumerate(procs):
             if p.poll() is None:
@@ -588,7 +607,23 @@ def _spawn_inference_server(config: VlmConfig) -> str:
     # rescans its cache on every model-list request, which can exceed
     # the urllib timeout and trigger an infinite probe loop.
     ready_event = threading.Event()
-    ready_markers = ("Uvicorn running", "Application startup complete")
+    # See _spawn_parallel_inference_servers for why we accept these.
+    ready_markers = (
+        "Uvicorn running",
+        "Application startup complete",
+        "Starting vLLM API server",
+        "Available routes are",
+    )
+
+    def _probe() -> None:
+        while not ready_event.is_set() and proc.poll() is None:
+            if _server_is_up(api_base):
+                print("[server] ready (http probe)", flush=True)
+                ready_event.set()
+                return
+            time.sleep(2)
+
+    threading.Thread(target=_probe, daemon=True).start()
 
     def _stream_output() -> None:
         # Read raw chunks instead of iterating lines so tqdm progress
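
Reviewer note: both probe threads call a _server_is_up() helper that lives
elsewhere in vlm_client.py and is not part of this diff. As a review aid,
here is a minimal sketch of the shape that helper plausibly has, assuming
(per the subject line and the urllib mention in the last hunk) a GET against
/v1/models with a short urllib timeout. The name comes from the diff, but
the body below is an assumption and the real implementation may differ; it
also assumes api_base is a bare http://host:port root, so if api_base
already ends in /v1 the probed path would just be /models.

    import urllib.request

    def _server_is_up(api_base: str, timeout: float = 5.0) -> bool:
        # Hypothetical sketch: True once the OpenAI-compatible API answers
        # the model-list route; the actual helper in vlm_client.py may differ.
        try:
            with urllib.request.urlopen(f"{api_base}/v1/models", timeout=timeout) as resp:
                # Any 2xx answer means the server is accepting requests.
                # (urlopen raises HTTPError for 4xx/5xx, caught below.)
                return 200 <= resp.status < 300
        except (OSError, ValueError):
            # URLError, HTTPError, and socket timeouts are OSError
            # subclasses; ValueError covers a malformed URL. Either way the
            # server is not usable yet, so keep polling.
            return False

Because the probe only sets the per-server ready event and exits, it can race
the log watcher harmlessly: whichever signal fires first unblocks the spawn
helper, and the 2-second sleep keeps polling cheap while vllm loads weights.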