From 10fa65a996f982083070fc51879cbb3279221e96 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 28 Apr 2026 12:05:00 +0200
Subject: [PATCH] fix(annotate): LEROBOT_DISABLE_CUDNN escape hatch for conv3d
 crash

cuDNN 9.x + torch 2.8 has a regression where the conv3d kernel used in
Qwen-VL vision tower patch embedders fails with
CUDNN_STATUS_NOT_INITIALIZED. The crash is independent of model size
and reproduces on both Qwen2.5-VL and Qwen3-VL because both use 3D conv
for video patch embedding.

Setting LEROBOT_DISABLE_CUDNN=1 falls back to native PyTorch conv3d
kernels (slower but functional) so the pipeline can run while the
torch/cuDNN stack is still on the broken combo.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/vlm_client.py       | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 9ac40ec4f..2406a2a2b 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -148,6 +148,16 @@ def _make_vllm_client(config: VlmConfig) -> VlmClient:
         raise ImportError(
             "vllm is required for backend='vllm'. Install with `pip install lerobot[annotations]`."
         ) from exc
+    # Workaround for cuDNN 9.x + torch 2.8 conv3d regression that surfaces
+    # as CUDNN_STATUS_NOT_INITIALIZED in Qwen-VL vision-tower patch
+    # embedders. Setting LEROBOT_DISABLE_CUDNN=1 forces native PyTorch
+    # convolution kernels — slower but functional.
+    import os as _os  # noqa: PLC0415
+
+    if _os.environ.get("LEROBOT_DISABLE_CUDNN", "").lower() in {"1", "true", "yes"}:
+        import torch as _torch  # noqa: PLC0415
+
+        _torch.backends.cudnn.enabled = False
     llm_kwargs: dict[str, Any] = {
         "model": config.model_id,
         "tensor_parallel_size": config.tensor_parallel_size,