mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-21 11:39:50 +00:00
fix(annotate): low_cpu_mem_usage=True on transformers load path
The std::bad_alloc we hit on Qwen3-line VL models is not a real OOM — it triggers in the post-load tensor-placement path even on hosts with 2 TB of RAM. Passing low_cpu_mem_usage=True bypasses the offending intermediate staging buffer and is the standard accelerate workaround.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -181,13 +181,16 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
|
|||||||
"for VL models."
|
"for VL models."
|
||||||
)
|
)
|
||||||
processor = AutoProcessor.from_pretrained(config.model_id)
|
processor = AutoProcessor.from_pretrained(config.model_id)
|
||||||
# device_map='auto' loads weights directly to GPU(s) and shards when
|
# ``low_cpu_mem_usage=True`` avoids a transformers-internal staging
|
||||||
# needed; without it, transformers stages the full checkpoint in CPU
|
# buffer that has caused std::bad_alloc on Qwen3-line architectures
|
||||||
# memory first which OOMs the host on FP8/large models.
|
# even on hosts with TBs of RAM (the failing alloc is in the
|
||||||
|
# post-load tensor-placement path, not a real OOM).
|
||||||
|
# ``device_map='auto'`` then streams shards directly to the GPU.
|
||||||
model = auto_cls.from_pretrained(
|
model = auto_cls.from_pretrained(
|
||||||
config.model_id,
|
config.model_id,
|
||||||
torch_dtype="auto",
|
torch_dtype="auto",
|
||||||
device_map="auto",
|
device_map="auto",
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
)
|
)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user