From a2bacb2f762a9ee5438d1e0bfc9b68c7703ba27c Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Mon, 27 Apr 2026 23:29:20 +0200
Subject: [PATCH] fix(annotate): low_cpu_mem_usage=True on transformers load path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The std::bad_alloc we hit on Qwen3-line VL models is not a real OOM:
it triggers in the post-load tensor-placement path even on hosts with
2 TB RAM. low_cpu_mem_usage=True bypasses the offending intermediate
staging buffer and is the standard accelerate workaround.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lerobot/annotations/steerable_pipeline/vlm_client.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index edb62f297..cb7264446 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -181,13 +181,16 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
             "for VL models."
         )
     processor = AutoProcessor.from_pretrained(config.model_id)
-    # device_map='auto' loads weights directly to GPU(s) and shards when
-    # needed; without it, transformers stages the full checkpoint in CPU
-    # memory first which OOMs the host on FP8/large models.
+    # ``low_cpu_mem_usage=True`` avoids a transformers-internal staging
+    # buffer that has caused std::bad_alloc on Qwen3-line architectures
+    # even on hosts with TBs of RAM (the failing alloc is in the
+    # post-load tensor-placement path, not a real OOM).
+    # ``device_map='auto'`` then streams shards directly to the GPU.
     model = auto_cls.from_pretrained(
         config.model_id,
         torch_dtype="auto",
         device_map="auto",
+        low_cpu_mem_usage=True,
     )
     model.eval()
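
Illustrative sketch (not part of the patch): roughly what the patched load
path amounts to once applied. AutoModelForVision2Seq and the model id below
are placeholders standing in for the auto_cls and config.model_id resolved
inside vlm_client.py; torch_dtype="auto", device_map="auto", and
low_cpu_mem_usage=True are standard transformers from_pretrained arguments.

    from transformers import AutoModelForVision2Seq, AutoProcessor

    model_id = "your-org/your-vl-model"  # placeholder; the real id comes from VlmConfig.model_id

    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype="auto",       # keep the checkpoint's dtype instead of upcasting to fp32
        device_map="auto",        # shard/stream weights directly onto the available GPU(s)
        low_cpu_mem_usage=True,   # skip the full-size CPU staging buffer blamed for the std::bad_alloc
    )
    model.eval()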