mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 19:19:56 +00:00
fix(profiling): fix pi0 cuBLAS error and pi05 OOM on 22GB GPU
- Move cudnn_deterministic to per-spec train_args instead of hardcoding it for all models. cuBLAS deterministic mode triggers internal errors on Gemma-based models (pi0, pi05) during the backward pass.
- Enable use_amp=true for pi0, pi0_fast, and pi05 to reduce the memory footprint from fp32 (~16GB for weights alone) to bf16, fitting within the 22GB GPU budget with room for activations and gradients.
- Small models (act, diffusion, multi_task_dit) still use deterministic mode for reproducible profiling results.

Made-with: Cursor
This commit is contained in:
@@ -6,7 +6,8 @@
|
|||||||
"--dataset.episodes=[0]",
|
"--dataset.episodes=[0]",
|
||||||
"--policy.type=act",
|
"--policy.type=act",
|
||||||
"--policy.device=cuda",
|
"--policy.device=cuda",
|
||||||
"--batch_size=4"
|
"--batch_size=4",
|
||||||
|
"--cudnn_deterministic=true"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"diffusion": {
|
"diffusion": {
|
||||||
@@ -16,7 +17,8 @@
|
|||||||
"--dataset.episodes=[0]",
|
"--dataset.episodes=[0]",
|
||||||
"--policy.type=diffusion",
|
"--policy.type=diffusion",
|
||||||
"--policy.device=cuda",
|
"--policy.device=cuda",
|
||||||
"--batch_size=4"
|
"--batch_size=4",
|
||||||
|
"--cudnn_deterministic=true"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"groot": {
|
"groot": {
|
||||||
@@ -45,7 +47,8 @@
|
|||||||
"--policy.device=cuda",
|
"--policy.device=cuda",
|
||||||
"--policy.horizon=32",
|
"--policy.horizon=32",
|
||||||
"--policy.n_action_steps=30",
|
"--policy.n_action_steps=30",
|
||||||
"--batch_size=4"
|
"--batch_size=4",
|
||||||
|
"--cudnn_deterministic=true"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"pi0": {
|
"pi0": {
|
||||||
@@ -56,6 +59,7 @@
|
|||||||
"--policy.path=lerobot/pi0_base",
|
"--policy.path=lerobot/pi0_base",
|
||||||
"--policy.device=cuda",
|
"--policy.device=cuda",
|
||||||
"--policy.n_action_steps=30",
|
"--policy.n_action_steps=30",
|
||||||
|
"--policy.use_amp=true",
|
||||||
"--batch_size=1",
|
"--batch_size=1",
|
||||||
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
||||||
]
|
]
|
||||||
@@ -68,6 +72,7 @@
|
|||||||
"--policy.path=lerobot/pi0fast-base",
|
"--policy.path=lerobot/pi0fast-base",
|
||||||
"--policy.device=cuda",
|
"--policy.device=cuda",
|
||||||
"--policy.n_action_steps=30",
|
"--policy.n_action_steps=30",
|
||||||
|
"--policy.use_amp=true",
|
||||||
"--batch_size=1",
|
"--batch_size=1",
|
||||||
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
||||||
]
|
]
|
||||||
@@ -80,6 +85,7 @@
|
|||||||
"--policy.path=lerobot/pi05_base",
|
"--policy.path=lerobot/pi05_base",
|
||||||
"--policy.device=cuda",
|
"--policy.device=cuda",
|
||||||
"--policy.n_action_steps=30",
|
"--policy.n_action_steps=30",
|
||||||
|
"--policy.use_amp=true",
|
||||||
"--batch_size=1",
|
"--batch_size=1",
|
||||||
"--policy.normalization_mapping={\"ACTION\": \"MEAN_STD\", \"STATE\": \"MEAN_STD\", \"VISUAL\": \"IDENTITY\"}",
|
"--policy.normalization_mapping={\"ACTION\": \"MEAN_STD\", \"STATE\": \"MEAN_STD\", \"VISUAL\": \"IDENTITY\"}",
|
||||||
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
||||||
|
|||||||
@@ -160,7 +160,6 @@ def build_train_command(spec: ProfilingSpec, run_dir: Path, profile_mode: str) -
|
|||||||
"--policy.push_to_hub=false",
|
"--policy.push_to_hub=false",
|
||||||
"--num_workers=0",
|
"--num_workers=0",
|
||||||
"--log_freq=1",
|
"--log_freq=1",
|
||||||
"--cudnn_deterministic=true",
|
|
||||||
f"--profile_mode={profile_mode}",
|
f"--profile_mode={profile_mode}",
|
||||||
f"--profile_output_dir={profile_output_dir}",
|
f"--profile_output_dir={profile_output_dir}",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -103,7 +103,6 @@ def test_build_train_command_includes_profiling_outputs(tmp_path):
|
|||||||
assert any(arg.startswith("--profile_output_dir=") for arg in cmd)
|
assert any(arg.startswith("--profile_output_dir=") for arg in cmd)
|
||||||
assert "--profile_mode=trace" in cmd
|
assert "--profile_mode=trace" in cmd
|
||||||
assert "--eval_freq=0" in cmd
|
assert "--eval_freq=0" in cmd
|
||||||
assert "--cudnn_deterministic=true" in cmd
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_artifact_index_collects_tables_and_traces(tmp_path):
|
def test_build_artifact_index_collects_tables_and_traces(tmp_path):
|
||||||
|
|||||||
Reference in New Issue
Block a user