From 994ad880eedd837d7cfffcd890e923f15f7bf7f3 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 23:47:52 +0200
Subject: [PATCH] fix(annotate): probe /v1/models for spawn-helper readiness

vllm with --uvicorn-log-level warning suppresses the "Uvicorn running"
banner that the readiness watcher waited for, so the spawn helper hung
forever even after the API was live. Add an HTTP probe in parallel with
the log watcher and broaden the log markers to include vllm's own
"Starting vLLM API server" / "Available routes are" lines.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../steerable_pipeline/vlm_client.py          | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index e55e9bae4..0cb002d2c 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -455,7 +455,16 @@ def _spawn_parallel_inference_servers(config: VlmConfig) -> list[str]:
     api_bases: list[str] = []
     procs: list[subprocess.Popen] = []
     ready_events: list[threading.Event] = []
-    ready_markers = ("Uvicorn running", "Application startup complete")
+    # Multiple readiness signals: uvicorn's own banner is suppressed at
+    # ``--uvicorn-log-level warning``, so we also accept vllm's own
+    # "Starting vLLM API server" line and the route-listing line. The
+    # HTTP probe below is the ultimate fallback.
+    ready_markers = (
+        "Uvicorn running",
+        "Application startup complete",
+        "Starting vLLM API server",
+        "Available routes are",
+    )
     # Single lock for all server-stream threads so multibyte chars from
     # different servers don't interleave and tear UTF-8 sequences.
     print_lock = threading.Lock()
@@ -506,6 +515,16 @@
 
         threading.Thread(target=_stream, args=(i, proc, ready), daemon=True).start()
 
+        def _probe(idx: int, base: str, ev: threading.Event, p: subprocess.Popen) -> None:
+            while not ev.is_set() and p.poll() is None:
+                if _server_is_up(base):
+                    print(f"[server-{idx}] ready (http probe)", flush=True)
+                    ev.set()
+                    return
+                time.sleep(2)
+
+        threading.Thread(target=_probe, args=(i, api_base, ready, proc), daemon=True).start()
+
     def _shutdown() -> None:
         for i, p in enumerate(procs):
             if p.poll() is None:
@@ -588,7 +607,23 @@ def _spawn_inference_server(config: VlmConfig) -> str:
     # rescans its cache on every model-list request, which can exceed
     # the urllib timeout and trigger an infinite probe loop.
     ready_event = threading.Event()
-    ready_markers = ("Uvicorn running", "Application startup complete")
+    # See _spawn_parallel_inference_servers for why we accept these.
+    ready_markers = (
+        "Uvicorn running",
+        "Application startup complete",
+        "Starting vLLM API server",
+        "Available routes are",
+    )
+
+    def _probe() -> None:
+        while not ready_event.is_set() and proc.poll() is None:
+            if _server_is_up(api_base):
+                print("[server] ready (http probe)", flush=True)
+                ready_event.set()
+                return
+            time.sleep(2)
+
+    threading.Thread(target=_probe, daemon=True).start()
 
     def _stream_output() -> None:
         # Read raw chunks instead of iterating lines so tqdm progress
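
Reviewer note: both probe threads call a _server_is_up() helper that lives
elsewhere in vlm_client.py and is not part of this diff. As a review aid,
here is a minimal sketch of the shape that helper plausibly has, assuming
(per the subject line and the urllib mention in the last hunk) a GET against
/v1/models with a short urllib timeout. The name comes from the diff, but
the body below is an assumption and the real implementation may differ; it
also assumes api_base is a bare http://host:port root, so if api_base
already ends in /v1 the probed path would just be /models.

    import urllib.request

    def _server_is_up(api_base: str, timeout: float = 5.0) -> bool:
        # Hypothetical sketch: True once the OpenAI-compatible API answers
        # the model-list route; the actual helper in vlm_client.py may differ.
        try:
            with urllib.request.urlopen(f"{api_base}/v1/models", timeout=timeout) as resp:
                # Any 2xx answer means the server is accepting requests.
                # (urlopen raises HTTPError for 4xx/5xx, caught below.)
                return 200 <= resp.status < 300
        except (OSError, ValueError):
            # URLError, HTTPError, and socket timeouts are OSError
            # subclasses; ValueError covers a malformed URL. Either way the
            # server is not usable yet, so keep polling.
            return False

Because the probe only sets the per-server ready event and exits, it can race
the log watcher harmlessly: whichever signal fires first unblocks the spawn
helper, and the 2-second sleep keeps polling cheap while vllm loads weights.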