From 8b9c598cf47377661636eb26dfa95a7b13257243 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 16:39:50 +0200
Subject: [PATCH] feat(annotate): auto_serve mode spawns and tears down
 inference server

Setting --vlm.auto_serve=true with --vlm.backend=openai makes the CLI
launch 'transformers serve <model_id> --port <serve_port>
--continuous-batching' as a child process, poll /v1/models until the
server is ready (up to serve_ready_timeout_s), run the pipeline, then
SIGINT the server on process exit. Override the spawn command with
--vlm.serve_command='vllm serve ...' or any OpenAI-compatible launcher.
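
Example invocation (CLI entry point elided; flag spelling follows the
description above):

    <annotate-cli> --vlm.backend=openai --vlm.model_id=<model> \
        --vlm.auto_serve=true --vlm.serve_port=8000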

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../annotations/steerable_pipeline/config.py |  12 +++
 .../steerable_pipeline/vlm_client.py         |  92 +++++++++++++++++++++-
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 5dad83629..3126602be 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -83,6 +83,18 @@ class VlmConfig:
     """Base URL for the ``openai`` backend."""
     api_key: str = "EMPTY"
     """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
+    auto_serve: bool = False
+    """When True with ``backend=openai``, the CLI spawns the inference
+    server itself (default: ``transformers serve``), waits for it to be
+    ready, runs the pipeline, and tears it down on exit. Override the
+    command via ``serve_command``."""
+    serve_port: int = 8000
+    """Port the auto-spawned server binds to. Sets ``api_base`` automatically."""
+    serve_command: str | None = None
+    """Override the auto-serve command (full shell command). When ``None``, we run
+    ``transformers serve <model_id> --port <serve_port> --continuous-batching``."""
+    serve_ready_timeout_s: float = 600.0
+    """Max seconds to wait for the server to start serving requests."""
     max_new_tokens: int = 512
     temperature: float = 0.2
     json_mode: bool = True
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 2c12d4780..b5503a6bb 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -257,8 +257,10 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
     """Backend that talks to any OpenAI-compatible server.
 
     Compatible with ``vllm serve``, ``transformers serve``,
-    ``ktransformers serve``, and hosted endpoints. The server is
-    expected to be already running and to host ``config.model_id``.
+    ``ktransformers serve``, and hosted endpoints. By default the server
+    is expected to be already running. Set ``auto_serve=True`` to have
+    this client spawn one (default: ``transformers serve``), wait until
+    it's ready, and tear it down on process exit.
 
     Image blocks ``{"type":"image", "image":<...>}`` are auto-converted
     to ``image_url`` data-URLs. Video blocks
@@ -273,7 +275,11 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
             "Install with `pip install openai`."
         ) from exc
 
-    client = OpenAI(base_url=config.api_base, api_key=config.api_key)
+    api_base = config.api_base
+    if config.auto_serve:
+        api_base = _spawn_inference_server(config)
+
+    client = OpenAI(base_url=api_base, api_key=config.api_key)
 
     def _gen(
         batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
@@ -293,6 +299,86 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
     return _GenericTextClient(_gen, config)
 
 
+def _spawn_inference_server(config: VlmConfig) -> str:
+    """Spawn ``transformers serve`` (or ``serve_command``), wait until it
+    accepts ``/v1/models``, and register a shutdown hook.
+
+    Returns the full ``api_base`` URL the OpenAI client should use.
+    """
+    import atexit  # noqa: PLC0415
+    import collections  # noqa: PLC0415
+    import logging  # noqa: PLC0415
+    import shlex  # noqa: PLC0415
+    import signal  # noqa: PLC0415
+    import subprocess  # noqa: PLC0415
+    import threading  # noqa: PLC0415
+    import time  # noqa: PLC0415
+    import urllib.request  # noqa: PLC0415
+
+    log = logging.getLogger(__name__)
+    cmd = config.serve_command
+    if not cmd:
+        cmd = (
+            f"transformers serve {shlex.quote(config.model_id)} "
+            f"--port {config.serve_port} --continuous-batching"
+        )
+    api_base = f"http://localhost:{config.serve_port}/v1"
+    log.info("auto_serve: launching: %s", cmd)
+    proc = subprocess.Popen(
+        shlex.split(cmd),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+    )
+
+    # Drain the child's output on a daemon thread so a full pipe buffer
+    # can never block the server; keep the last lines for error reports.
+    tail: collections.deque[str] = collections.deque(maxlen=50)
+
+    def _drain() -> None:
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            tail.append(line)
+
+    drain = threading.Thread(target=_drain, daemon=True)
+    drain.start()
+
+    def _shutdown() -> None:
+        if proc.poll() is None:
+            log.info("auto_serve: stopping pid=%s", proc.pid)
+            proc.send_signal(signal.SIGINT)
+            try:
+                proc.wait(timeout=15)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait(timeout=5)
+
+    atexit.register(_shutdown)
+
+    deadline = time.monotonic() + config.serve_ready_timeout_s
+    health_url = api_base.rstrip("/") + "/models"
+    while time.monotonic() < deadline:
+        if proc.poll() is not None:
+            drain.join(timeout=2)  # let the reader flush the last lines
+            raise RuntimeError(
+                f"auto_serve: inference server exited (rc={proc.returncode}). "
+                f"Tail of output:\n{''.join(tail)}"
+            )
+        try:
+            with urllib.request.urlopen(health_url, timeout=2) as resp:
+                if resp.status == 200:
+                    log.info("auto_serve: server ready at %s", api_base)
+                    return api_base
+        except Exception:  # noqa: BLE001 - intentional broad except
+            pass
+        time.sleep(2)
+    proc.terminate()
+    raise RuntimeError(
+        f"auto_serve: server did not become ready within {config.serve_ready_timeout_s}s"
+    )
+
+
 def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
     """Convert an internal message dict to OpenAI chat format.
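
For reviewers: a smoke-test sketch of the new helper. Constructing
VlmConfig with these keyword arguments and the model id below are
illustrative assumptions, not part of this patch.

import json
import urllib.request

from lerobot.annotations.steerable_pipeline.config import VlmConfig
from lerobot.annotations.steerable_pipeline.vlm_client import _spawn_inference_server

# Spawn the default server and block until /v1/models answers.
cfg = VlmConfig(
    model_id="Qwen/Qwen2.5-VL-3B-Instruct",  # illustrative model id
    auto_serve=True,
    serve_port=8123,
)
api_base = _spawn_inference_server(cfg)

# The server is now reachable; list the models it hosts.
with urllib.request.urlopen(api_base + "/models", timeout=5) as resp:
    print(json.dumps(json.load(resp), indent=2))
# On interpreter exit, the registered atexit hook SIGINTs the child
# server, escalating to a hard kill after 15 s.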