From ef1242bbd4a2904173ce0f327832910ba8e5d8c3 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 11:29:22 +0200
Subject: [PATCH] fix(annotate): expose gpu_memory_utilization and
 max_model_len for vllm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Large VL models (Qwen3-VL-30B-A3B BF16) take ~58 GB of an 80 GB H100,
leaving only ~22 GB for KV cache + cuDNN workspace. The vision tower's
3D conv then fails with CUDNN_STATUS_NOT_INITIALIZED because cuDNN
can't grab a workspace large enough.

- vlm.gpu_memory_utilization (default 0.9) — drop to 0.7 when the
  vision encoder needs more cuDNN workspace.
- vlm.max_model_len — cap context to free KV cache memory; the 262k
  default for Qwen3 is wildly more than annotation prompts need.
- vlm.trust_remote_code — already plumbed; now also passed to LLM().

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lerobot/annotations/steerable_pipeline/config.py |  8 ++++++++
 .../annotations/steerable_pipeline/vlm_client.py     | 10 +++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 0ea8240a5..07326170e 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -68,6 +68,14 @@ class VlmConfig:
     json_mode: bool = True
     batch_size: int = 4
     tensor_parallel_size: int = 1
+    gpu_memory_utilization: float = 0.9
+    """Fraction of GPU memory vllm allocates for weights + KV cache.
+    Lower (e.g. 0.7) when the vision encoder needs cuDNN workspace, or to
+    avoid CUDNN_STATUS_NOT_INITIALIZED on tight VRAM (30B BF16 on 80 GB)."""
+    max_model_len: int | None = None
+    """Cap context length. ``None`` keeps the model's default; on H100 80 GB
+    a 30B BF16 model often needs ``max_model_len=8192`` or smaller to leave
+    room for KV cache."""
     trust_remote_code: bool = True
     """Pass ``trust_remote_code`` to HF auto-classes. Required for many
     newer VL checkpoints (Qwen3.x FP8, etc.) that ship custom loader code."""
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 67017976c..9ac40ec4f 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -148,7 +148,15 @@ def _make_vllm_client(config: VlmConfig) -> VlmClient:
         raise ImportError(
             "vllm is required for backend='vllm'. Install with `pip install lerobot[annotations]`."
         ) from exc
-    llm = LLM(model=config.model_id, tensor_parallel_size=config.tensor_parallel_size)
+    llm_kwargs: dict[str, Any] = {
+        "model": config.model_id,
+        "tensor_parallel_size": config.tensor_parallel_size,
+        "gpu_memory_utilization": config.gpu_memory_utilization,
+        "trust_remote_code": config.trust_remote_code,
+    }
+    if config.max_model_len is not None:
+        llm_kwargs["max_model_len"] = config.max_model_len
+    llm = LLM(**llm_kwargs)
 
     def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
         params = SamplingParams(