diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 0ea8240a5..07326170e 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -68,6 +68,14 @@ class VlmConfig:
     json_mode: bool = True
     batch_size: int = 4
     tensor_parallel_size: int = 1
+    gpu_memory_utilization: float = 0.9
+    """Fraction of GPU memory vllm allocates for weights + KV cache.
+    Lower (e.g. 0.7) when the vision encoder needs cuDNN workspace, or to
+    avoid CUDNN_STATUS_NOT_INITIALIZED on tight VRAM (30B BF16 on 80 GB)."""
+    max_model_len: int | None = None
+    """Cap context length. ``None`` keeps the model's default; on H100 80 GB
+    a 30B BF16 model often needs ``max_model_len=8192`` or smaller to leave
+    room for KV cache."""
     trust_remote_code: bool = True
     """Pass ``trust_remote_code`` to HF auto-classes. Required for many
     newer VL checkpoints (Qwen3.x FP8, etc.) that ship custom loader code."""
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 67017976c..9ac40ec4f 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -148,7 +148,15 @@ def _make_vllm_client(config: VlmConfig) -> VlmClient:
         raise ImportError(
             "vllm is required for backend='vllm'. Install with `pip install lerobot[annotations]`."
         ) from exc
-    llm = LLM(model=config.model_id, tensor_parallel_size=config.tensor_parallel_size)
+    llm_kwargs: dict[str, Any] = {
+        "model": config.model_id,
+        "tensor_parallel_size": config.tensor_parallel_size,
+        "gpu_memory_utilization": config.gpu_memory_utilization,
+        "trust_remote_code": config.trust_remote_code,
+    }
+    if config.max_model_len is not None:
+        llm_kwargs["max_model_len"] = config.max_model_len
+    llm = LLM(**llm_kwargs)
 
     def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
         params = SamplingParams(
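
For reviewers, a minimal standalone sketch of how the new knobs flow into `vllm.LLM` construction. The `VlmConfig` below is a local mirror of only the fields this patch touches (the real class lives in `config.py`), and the model id and memory values are illustrative assumptions, not defaults shipped by this patch:

```python
from dataclasses import dataclass
from typing import Any


# Standalone mirror of the VlmConfig fields touched by this patch; the real
# class lives in src/lerobot/annotations/steerable_pipeline/config.py.
@dataclass
class VlmConfig:
    model_id: str
    tensor_parallel_size: int = 1
    gpu_memory_utilization: float = 0.9
    max_model_len: int | None = None
    trust_remote_code: bool = True


def build_llm_kwargs(config: VlmConfig) -> dict[str, Any]:
    """Assemble vllm.LLM kwargs the same way the patched _make_vllm_client does."""
    kwargs: dict[str, Any] = {
        "model": config.model_id,
        "tensor_parallel_size": config.tensor_parallel_size,
        "gpu_memory_utilization": config.gpu_memory_utilization,
        "trust_remote_code": config.trust_remote_code,
    }
    # max_model_len is only forwarded when set, so vllm keeps the
    # checkpoint's native context length by default.
    if config.max_model_len is not None:
        kwargs["max_model_len"] = config.max_model_len
    return kwargs


# Tight-VRAM profile per the new docstrings: leave cuDNN workspace headroom
# and cap the context so the KV cache fits. Model id is a hypothetical example.
config = VlmConfig(
    model_id="Qwen/Qwen2.5-VL-7B-Instruct",
    gpu_memory_utilization=0.7,
    max_model_len=8192,
)
print(build_llm_kwargs(config))
# With vllm installed and a GPU available, these pass straight through:
#   from vllm import LLM
#   llm = LLM(**build_llm_kwargs(config))
```

Forwarding `max_model_len` only when it is not `None` (rather than always passing it) keeps the pre-patch behavior for existing configs: vllm continues to derive the context window from the checkpoint unless the user opts into a cap.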