diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 0c25c5cf8..8c85f2736 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -276,8 +276,18 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
         ) from exc
 
     api_base = config.api_base
-    if config.auto_serve and not _server_is_up(api_base):
-        api_base = _spawn_inference_server(config)
+    print(
+        f"[lerobot-annotate] backend=openai model={config.model_id} "
+        f"api_base={api_base} auto_serve={config.auto_serve}",
+        flush=True,
+    )
+    if config.auto_serve:
+        if _server_is_up(api_base):
+            print(f"[lerobot-annotate] reusing server already up at {api_base}", flush=True)
+        else:
+            print("[lerobot-annotate] no server reachable; spawning one", flush=True)
+            api_base = _spawn_inference_server(config)
+            print(f"[lerobot-annotate] server ready at {api_base}", flush=True)
 
     client = OpenAI(base_url=api_base, api_key=config.api_key)
 
@@ -315,17 +325,21 @@ def _spawn_inference_server(config: VlmConfig) -> str:
     """Spawn ``transformers serve`` (or ``serve_command``), wait until it
     accepts ``/v1/models``, and register a shutdown hook.
 
+    Streams the server's stdout/stderr to the parent terminal in
+    real time on a background thread so users can see model-load
+    progress and errors as they happen.
+
     Returns the full ``api_base`` URL the OpenAI client should use.
     """
     import atexit  # noqa: PLC0415
-    import logging  # noqa: PLC0415
     import shlex  # noqa: PLC0415
     import signal  # noqa: PLC0415
     import subprocess  # noqa: PLC0415
+    import sys  # noqa: PLC0415
+    import threading  # noqa: PLC0415
     import time  # noqa: PLC0415
     import urllib.request  # noqa: PLC0415
 
-    log = logging.getLogger(__name__)
     cmd = config.serve_command
     if not cmd:
         cmd = (
@@ -333,7 +347,7 @@ def _spawn_inference_server(config: VlmConfig) -> str:
             f"--port {config.serve_port} --continuous-batching"
         )
     api_base = f"http://localhost:{config.serve_port}/v1"
-    log.info("auto_serve: launching: %s", cmd)
+    print(f"[server] launching: {cmd}", flush=True)
     proc = subprocess.Popen(
         shlex.split(cmd),
         stdout=subprocess.PIPE,
@@ -342,9 +356,17 @@ def _spawn_inference_server(config: VlmConfig) -> str:
         bufsize=1,
     )
 
+    def _stream_output() -> None:
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            sys.stdout.write(f"[server] {line}")
+            sys.stdout.flush()
+
+    threading.Thread(target=_stream_output, daemon=True).start()
+
     def _shutdown() -> None:
         if proc.poll() is None:
-            log.info("auto_serve: stopping pid=%s", proc.pid)
+            print(f"[server] stopping pid={proc.pid}", flush=True)
             proc.send_signal(signal.SIGINT)
             try:
                 proc.wait(timeout=15)
@@ -358,22 +380,20 @@ def _spawn_inference_server(config: VlmConfig) -> str:
     health_url = api_base.rstrip("/") + "/models"
     while time.monotonic() < deadline:
         if proc.poll() is not None:
-            tail = proc.stdout.read() if proc.stdout else ""
             raise RuntimeError(
-                f"auto_serve: inference server exited (rc={proc.returncode}). "
-                f"Tail of output:\n{tail}"
+                f"[server] inference server exited unexpectedly with rc={proc.returncode}. "
+                "See [server] log lines above for the cause."
             )
         try:
             with urllib.request.urlopen(health_url, timeout=2) as resp:
                 if resp.status == 200:
-                    log.info("auto_serve: server ready at %s", api_base)
                     return api_base
         except Exception:  # noqa: BLE001 - intentional broad except
             pass
         time.sleep(2)
 
     proc.terminate()
     raise RuntimeError(
-        f"auto_serve: server did not become ready within {config.serve_ready_timeout_s}s"
+        f"[server] did not become ready within {config.serve_ready_timeout_s}s"
     )
 
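
Reviewer note (not part of the patch): the reuse branch in the first hunk calls `_server_is_up`, which this diff leaves untouched. For context, a minimal sketch of what that probe plausibly looks like, assuming it mirrors the readiness loop added in `_spawn_inference_server`; the actual helper lives elsewhere in vlm_client.py and may differ:

    import urllib.request

    def _server_is_up(api_base: str, timeout_s: float = 2.0) -> bool:
        # Same health check the readiness loop uses: an OpenAI-compatible
        # server answers GET <api_base>/models with HTTP 200.
        health_url = api_base.rstrip("/") + "/models"
        try:
            with urllib.request.urlopen(health_url, timeout=timeout_s) as resp:
                return resp.status == 200
        except Exception:  # connection refused, timeout, bad URL, ...
            return False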
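
The stdout-streaming change in the fourth hunk is a self-contained pattern that can be tried outside the annotator. A minimal runnable sketch; the child command and `[server]` prefix are illustrative only, and the `stderr`/`text` flags are assumed to match unchanged Popen lines that fall between the hunks:

    import subprocess
    import sys
    import threading

    # Line-buffered text pipes with stderr merged into stdout, so the reader
    # thread sees one ordered stream of lines from the child.
    proc = subprocess.Popen(
        [sys.executable, "-c", "print('loading model...'); print('ready')"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )

    def _stream_output() -> None:
        assert proc.stdout is not None
        for line in proc.stdout:  # yields lines as the child flushes them
            sys.stdout.write(f"[server] {line}")
            sys.stdout.flush()

    # A daemon thread cannot keep the interpreter alive if the child wedges,
    # which is why the patch uses one for the long-running server process.
    reader = threading.Thread(target=_stream_output, daemon=True)
    reader.start()
    proc.wait()
    reader.join(timeout=5)  # drain any output still buffered in the pipe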