From f8c00d9ca51df9437acd96195946c99e18a638ce Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Mon, 27 Apr 2026 23:05:14 +0200
Subject: [PATCH] fix(annotate): use device_map='auto' for transformers backend

Without device_map, transformers stages the full FP8 checkpoint in CPU
RAM before any GPU placement, OOMing the host on 27B+ models even when
the GPU has enough VRAM. device_map='auto' streams shards directly to
GPU memory.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lerobot/annotations/steerable_pipeline/vlm_client.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 734e33ffd..edb62f297 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -181,7 +181,14 @@ def _make_transformers_client(config: VlmConfig) -> VlmClient:
             "for VL models."
         )
     processor = AutoProcessor.from_pretrained(config.model_id)
-    model = auto_cls.from_pretrained(config.model_id, torch_dtype="auto")
+    # device_map='auto' loads weights directly to GPU(s) and shards when
+    # needed; without it, transformers stages the full checkpoint in CPU
+    # memory first which OOMs the host on FP8/large models.
+    model = auto_cls.from_pretrained(
+        config.model_id,
+        torch_dtype="auto",
+        device_map="auto",
+    )
     model.eval()
 
     def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
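
For context, a minimal standalone sketch of the loading behavior the
patch relies on. MODEL_ID and the use of AutoModel are placeholders
(the real code goes through auto_cls from config.model_id), and
device_map="auto" assumes the accelerate package is installed, which
transformers requires for device-map support:

    from transformers import AutoModel

    MODEL_ID = "org/some-27b-vlm"  # hypothetical checkpoint id

    # Baseline (what the old code did): the whole checkpoint is
    # materialized in CPU RAM before any move to GPU.
    #   model = AutoModel.from_pretrained(MODEL_ID, torch_dtype="auto")

    # With device_map="auto", accelerate places each shard directly in
    # GPU memory, splitting across devices when one GPU is too small.
    model = AutoModel.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        device_map="auto",
    )

    # Inspect the resulting placement; prints e.g. {"": 0} when the
    # whole model fits on a single GPU.
    print(model.hf_device_map)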