From dbe01b0444c633a258f84dcdce0dc888fb13bc3a Mon Sep 17 00:00:00 2001 From: Pepijn Date: Thu, 16 Apr 2026 15:34:04 +0200 Subject: [PATCH] fix(profiling): fix pi0 cuBLAS error and pi05 OOM on 22GB GPU - Move cudnn_deterministic to per-spec train_args instead of hardcoding it for all models. cuBLAS deterministic mode triggers internal errors on Gemma-based models (pi0, pi05) during backward pass. - Enable use_amp=true for pi0, pi0_fast, and pi05 to reduce memory footprint from fp32 (~16GB weights alone) to bf16, fitting within 22GB GPU budget with room for activations and gradients. - Small models (act, diffusion, multi_task_dit) still use deterministic mode for reproducible profiling results. Made-with: Cursor --- profiling/model_profiling_specs.json | 12 +++++++++--- scripts/ci/run_model_profiling.py | 1 - tests/scripts/test_model_profiling.py | 1 - 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/profiling/model_profiling_specs.json b/profiling/model_profiling_specs.json index 3f526882d..8774c993e 100644 --- a/profiling/model_profiling_specs.json +++ b/profiling/model_profiling_specs.json @@ -6,7 +6,8 @@ "--dataset.episodes=[0]", "--policy.type=act", "--policy.device=cuda", - "--batch_size=4" + "--batch_size=4", + "--cudnn_deterministic=true" ] }, "diffusion": { @@ -16,7 +17,8 @@ "--dataset.episodes=[0]", "--policy.type=diffusion", "--policy.device=cuda", - "--batch_size=4" + "--batch_size=4", + "--cudnn_deterministic=true" ] }, "groot": { @@ -45,7 +47,8 @@ "--policy.device=cuda", "--policy.horizon=32", "--policy.n_action_steps=30", - "--batch_size=4" + "--batch_size=4", + "--cudnn_deterministic=true" ] }, "pi0": { @@ -56,6 +59,7 @@ "--policy.path=lerobot/pi0_base", "--policy.device=cuda", "--policy.n_action_steps=30", + "--policy.use_amp=true", "--batch_size=1", "--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}" ] @@ -68,6 +72,7 @@ "--policy.path=lerobot/pi0fast-base", "--policy.device=cuda", "--policy.n_action_steps=30", + "--policy.use_amp=true", "--batch_size=1", "--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}" ] @@ -80,6 +85,7 @@ "--policy.path=lerobot/pi05_base", "--policy.device=cuda", "--policy.n_action_steps=30", + "--policy.use_amp=true", "--batch_size=1", "--policy.normalization_mapping={\"ACTION\": \"MEAN_STD\", \"STATE\": \"MEAN_STD\", \"VISUAL\": \"IDENTITY\"}", "--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}" diff --git a/scripts/ci/run_model_profiling.py b/scripts/ci/run_model_profiling.py index e63f1b46f..0ae32db2a 100644 --- a/scripts/ci/run_model_profiling.py +++ b/scripts/ci/run_model_profiling.py @@ -160,7 +160,6 @@ def build_train_command(spec: ProfilingSpec, run_dir: Path, profile_mode: str) - "--policy.push_to_hub=false", "--num_workers=0", "--log_freq=1", - "--cudnn_deterministic=true", f"--profile_mode={profile_mode}", f"--profile_output_dir={profile_output_dir}", ] diff --git a/tests/scripts/test_model_profiling.py b/tests/scripts/test_model_profiling.py index 9e3e8a1f6..be11dab49 100644 --- a/tests/scripts/test_model_profiling.py +++ b/tests/scripts/test_model_profiling.py @@ -103,7 +103,6 @@ def test_build_train_command_includes_profiling_outputs(tmp_path): assert any(arg.startswith("--profile_output_dir=") for arg in cmd) assert "--profile_mode=trace" in cmd assert "--eval_freq=0" in cmd - assert "--cudnn_deterministic=true" in cmd def test_build_artifact_index_collects_tables_and_traces(tmp_path):