mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 11:09:59 +00:00
fix(annotate): default transformers backend to manual GPU placement
Loading Qwen3-VL via transformers with accelerate's device_map='auto'
fails with std::bad_alloc even on hosts with abundant RAM, so the failure
is not a genuine out-of-memory condition. The bug is in accelerate's
post-load dispatch path. Bypassing accelerate by loading to CPU first
(with low_cpu_mem_usage=True) and then calling .to('cuda') manually
avoids that path.
Setting LEROBOT_TRANSFORMERS_DEVICE_MAP=auto (in practice, any value other
than the default 'manual') switches back to the old device_map='auto'
behavior for cases where it works.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -201,20 +201,32 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
|
|||||||
processor = AutoProcessor.from_pretrained(
|
processor = AutoProcessor.from_pretrained(
|
||||||
config.model_id, trust_remote_code=config.trust_remote_code
|
config.model_id, trust_remote_code=config.trust_remote_code
|
||||||
)
|
)
|
||||||
# ``low_cpu_mem_usage=True`` avoids a transformers-internal staging
|
import os as _os # noqa: PLC0415
|
||||||
# buffer that has caused std::bad_alloc on Qwen3-line architectures
|
|
||||||
# even on hosts with TBs of RAM (the failing alloc is in the
|
use_accelerate = _os.environ.get("LEROBOT_TRANSFORMERS_DEVICE_MAP", "manual") != "manual"
|
||||||
# post-load tensor-placement path, not a real OOM).
|
# ``device_map='auto'`` triggers a known std::bad_alloc on the Qwen3-VL
|
||||||
# ``device_map='auto'`` then streams shards directly to the GPU.
|
# post-load dispatch path (the alloc fails in accelerate's hook setup
|
||||||
# ``trust_remote_code`` is required for many newer VL releases
|
# even with TBs of host RAM). Default to manual: load on CPU with
|
||||||
# (Qwen3.6-FP8, etc.) that ship a custom loader in the repo.
|
# ``low_cpu_mem_usage=True``, then ``.to("cuda")``. Set
|
||||||
model = auto_cls.from_pretrained(
|
# ``LEROBOT_TRANSFORMERS_DEVICE_MAP=auto`` to opt back into the old path.
|
||||||
config.model_id,
|
if use_accelerate:
|
||||||
torch_dtype="auto",
|
model = auto_cls.from_pretrained(
|
||||||
device_map="auto",
|
config.model_id,
|
||||||
low_cpu_mem_usage=True,
|
torch_dtype="auto",
|
||||||
trust_remote_code=config.trust_remote_code,
|
device_map="auto",
|
||||||
)
|
low_cpu_mem_usage=True,
|
||||||
|
trust_remote_code=config.trust_remote_code,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
import torch as _torch # noqa: PLC0415
|
||||||
|
|
||||||
|
model = auto_cls.from_pretrained(
|
||||||
|
config.model_id,
|
||||||
|
torch_dtype=_torch.bfloat16,
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
|
trust_remote_code=config.trust_remote_code,
|
||||||
|
)
|
||||||
|
model = model.to("cuda")
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
|
def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
|
||||||
|
|||||||
Reference in New Issue
Block a user