From e21996f23b3775dd949b6c9913245b806ec44e43 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 28 Apr 2026 13:27:19 +0200
Subject: [PATCH] fix(annotate): default transformers backend to manual GPU placement

Loading Qwen3-VL via transformers + accelerate's device_map='auto'
fails with std::bad_alloc on hosts with abundant RAM. The bug is in
accelerate's post-load dispatch path. Bypassing accelerate by loading
to CPU first and then calling .to('cuda') manually avoids that path.

LEROBOT_TRANSFORMERS_DEVICE_MAP=auto switches back to the old behavior
for cases where it works.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../steerable_pipeline/vlm_client.py | 40 ++++++++++++-------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 2406a2a2b..4f9df78a5 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -201,20 +201,32 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
     processor = AutoProcessor.from_pretrained(
         config.model_id, trust_remote_code=config.trust_remote_code
     )
-    # ``low_cpu_mem_usage=True`` avoids a transformers-internal staging
-    # buffer that has caused std::bad_alloc on Qwen3-line architectures
-    # even on hosts with TBs of RAM (the failing alloc is in the
-    # post-load tensor-placement path, not a real OOM).
-    # ``device_map='auto'`` then streams shards directly to the GPU.
-    # ``trust_remote_code`` is required for many newer VL releases
-    # (Qwen3.6-FP8, etc.) that ship a custom loader in the repo.
-    model = auto_cls.from_pretrained(
-        config.model_id,
-        torch_dtype="auto",
-        device_map="auto",
-        low_cpu_mem_usage=True,
-        trust_remote_code=config.trust_remote_code,
-    )
+    import os as _os  # noqa: PLC0415
+
+    use_accelerate = _os.environ.get("LEROBOT_TRANSFORMERS_DEVICE_MAP", "manual") != "manual"
+    # ``device_map='auto'`` triggers a known std::bad_alloc on the Qwen3-VL
+    # post-load dispatch path (the alloc fails in accelerate's hook setup
+    # even with TBs of host RAM). Default to manual: load on CPU with
+    # ``low_cpu_mem_usage=True``, then ``.to("cuda")``. Set
+    # ``LEROBOT_TRANSFORMERS_DEVICE_MAP=auto`` to opt back into the old path.
+    if use_accelerate:
+        model = auto_cls.from_pretrained(
+            config.model_id,
+            torch_dtype="auto",
+            device_map="auto",
+            low_cpu_mem_usage=True,
+            trust_remote_code=config.trust_remote_code,
+        )
+    else:
+        import torch as _torch  # noqa: PLC0415
+
+        model = auto_cls.from_pretrained(
+            config.model_id,
+            torch_dtype=_torch.bfloat16,
+            low_cpu_mem_usage=True,
+            trust_remote_code=config.trust_remote_code,
+        )
+        model = model.to("cuda")
     model.eval()
 
     def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
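
For reference, the manual path this patch defaults to boils down to the
standalone sketch below. The model id and the AutoModelForVision2Seq
class are stand-ins for illustration (the real code uses whatever
``auto_cls`` resolves to in vlm_client.py), and it assumes a CUDA
device is present:

    import torch
    from transformers import AutoModelForVision2Seq

    # Load entirely onto CPU first. low_cpu_mem_usage streams shards
    # into place instead of materializing a second staging copy in RAM.
    model = AutoModelForVision2Seq.from_pretrained(
        "Qwen/Qwen3-VL-8B-Instruct",  # stand-in model id
        torch_dtype=torch.bfloat16,   # fixed dtype on the manual path
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    # Single explicit device move; no accelerate dispatch hooks run.
    model = model.to("cuda")
    model.eval()

To opt back into the accelerate path on hosts where it works, set the
env var before launching the pipeline:

    export LEROBOT_TRANSFORMERS_DEVICE_MAP=auto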