From 8b9c598cf47377661636eb26dfa95a7b13257243 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 16:39:50 +0200
Subject: [PATCH] feat(annotate): auto_serve mode spawns and tears down
 inference server

Setting --vlm.auto_serve=true with --vlm.backend=openai makes the CLI
launch 'transformers serve <model_id> --port <serve_port>
--continuous-batching' as a child process, poll /v1/models until the
server is ready (up to serve_ready_timeout_s), run the pipeline, then
SIGINT the server on process exit. Override the spawn command with
--vlm.serve_command='vllm serve ...' or any OpenAI-compatible launcher.
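
Example invocation (CLI entry point elided; flag spelling follows the
description above):

    <annotate-cli> --vlm.backend=openai --vlm.model_id=<model> \
        --vlm.auto_serve=true --vlm.serve_port=8000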

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../annotations/steerable_pipeline/config.py |  12 +++
 .../steerable_pipeline/vlm_client.py         |  92 +++++++++++++++++++++-
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 5dad83629..3126602be 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -83,6 +83,18 @@ class VlmConfig:
     """Base URL for the ``openai`` backend."""
     api_key: str = "EMPTY"
     """API key for the ``openai`` backend; ``EMPTY`` works for local servers."""
+    auto_serve: bool = False
+    """When True with ``backend=openai``, the CLI spawns the inference
+    server itself (default: ``transformers serve``), waits for it to be
+    ready, runs the pipeline, and tears it down on exit. Override the
+    command via ``serve_command``."""
+    serve_port: int = 8000
+    """Port the auto-spawned server binds to. Sets ``api_base`` automatically."""
+    serve_command: str | None = None
+    """Override the auto-serve command (full shell command). When ``None``, we run
+    ``transformers serve <model_id> --port <serve_port> --continuous-batching``."""
+    serve_ready_timeout_s: float = 600.0
+    """Max seconds to wait for the server to start serving requests."""
     max_new_tokens: int = 512
     temperature: float = 0.2
     json_mode: bool = True
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 2c12d4780..b5503a6bb 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -257,8 +257,10 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
     """Backend that talks to any OpenAI-compatible server.
 
     Compatible with ``vllm serve``, ``transformers serve``,
-    ``ktransformers serve``, and hosted endpoints. The server is
-    expected to be already running and to host ``config.model_id``.
+    ``ktransformers serve``, and hosted endpoints. By default the server
+    is expected to be already running. Set ``auto_serve=True`` to have
+    this client spawn one (default: ``transformers serve``), wait until
+    it's ready, and tear it down on process exit.
 
     Image blocks ``{"type":"image", "image":<...>}`` are auto-converted
     to ``image_url`` data-URLs. Video blocks
@@ -273,7 +275,11 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
             "Install with `pip install openai`."
         ) from exc
 
-    client = OpenAI(base_url=config.api_base, api_key=config.api_key)
+    api_base = config.api_base
+    if config.auto_serve:
+        api_base = _spawn_inference_server(config)
+
+    client = OpenAI(base_url=api_base, api_key=config.api_key)
 
     def _gen(
         batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float
@@ -293,6 +299,86 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
     return _GenericTextClient(_gen, config)
 
 
+def _spawn_inference_server(config: VlmConfig) -> str:
+    """Spawn ``transformers serve`` (or ``serve_command``), wait until it
+    accepts ``/v1/models``, and register a shutdown hook.
+
+    Returns the full ``api_base`` URL the OpenAI client should use.
+    """
+    import atexit  # noqa: PLC0415
+    import collections  # noqa: PLC0415
+    import logging  # noqa: PLC0415
+    import shlex  # noqa: PLC0415
+    import signal  # noqa: PLC0415
+    import subprocess  # noqa: PLC0415
+    import threading  # noqa: PLC0415
+    import time  # noqa: PLC0415
+    import urllib.request  # noqa: PLC0415
+
+    log = logging.getLogger(__name__)
+    cmd = config.serve_command
+    if not cmd:
+        cmd = (
+            f"transformers serve {shlex.quote(config.model_id)} "
+            f"--port {config.serve_port} --continuous-batching"
+        )
+    api_base = f"http://localhost:{config.serve_port}/v1"
+    log.info("auto_serve: launching: %s", cmd)
+    proc = subprocess.Popen(
+        shlex.split(cmd),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+    )
+
+    # Drain the child's output on a daemon thread so a full pipe buffer
+    # can never block the server; keep the last lines for error reports.
+    tail: collections.deque[str] = collections.deque(maxlen=50)
+
+    def _drain() -> None:
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            tail.append(line)
+
+    drain = threading.Thread(target=_drain, daemon=True)
+    drain.start()
+
+    def _shutdown() -> None:
+        if proc.poll() is None:
+            log.info("auto_serve: stopping pid=%s", proc.pid)
+            proc.send_signal(signal.SIGINT)
+            try:
+                proc.wait(timeout=15)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait(timeout=5)
+
+    atexit.register(_shutdown)
+
+    deadline = time.monotonic() + config.serve_ready_timeout_s
+    health_url = api_base.rstrip("/") + "/models"
+    while time.monotonic() < deadline:
+        if proc.poll() is not None:
+            drain.join(timeout=2)  # let the reader flush the last lines
+            raise RuntimeError(
+                f"auto_serve: inference server exited (rc={proc.returncode}). "
+                f"Tail of output:\n{''.join(tail)}"
+            )
+        try:
+            with urllib.request.urlopen(health_url, timeout=2) as resp:
+                if resp.status == 200:
+                    log.info("auto_serve: server ready at %s", api_base)
+                    return api_base
+        except Exception:  # noqa: BLE001 - intentional broad except
+            pass
+        time.sleep(2)
+    proc.terminate()
+    raise RuntimeError(
+        f"auto_serve: server did not become ready within {config.serve_ready_timeout_s}s"
+    )
+
+
 def _to_openai_message(message: dict[str, Any]) -> dict[str, Any]:
     """Convert an internal message dict to OpenAI chat format.
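
For reviewers: a smoke-test sketch of the new helper. Constructing
VlmConfig with these keyword arguments and the model id below are
illustrative assumptions, not part of this patch.

import json
import urllib.request

from lerobot.annotations.steerable_pipeline.config import VlmConfig
from lerobot.annotations.steerable_pipeline.vlm_client import _spawn_inference_server

# Spawn the default server and block until /v1/models answers.
cfg = VlmConfig(
    model_id="Qwen/Qwen2.5-VL-3B-Instruct",  # illustrative model id
    auto_serve=True,
    serve_port=8123,
)
api_base = _spawn_inference_server(cfg)

# The server is now reachable; list the models it hosts.
with urllib.request.urlopen(api_base + "/models", timeout=5) as resp:
    print(json.dumps(json.load(resp), indent=2))
# On interpreter exit, the registered atexit hook SIGINTs the child
# server, escalating to a hard kill after 15 s.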