From da7da741f14fbf08da71f0e6e507d6d168ade663 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 16 Apr 2026 16:09:56 +0200
Subject: [PATCH] fix(profiling): use SGD for pi0/pi05/pi0_fast and free CUDA
 cache after deterministic forward

Adam optimizer states (exp_avg + exp_avg_sq) require ~16GB extra on top of
model params and gradients for 4B parameter models, exceeding the 22GB GPU.
SGD has zero optimizer state overhead and profiling only measures
forward/backward timing anyway.

Also adds torch.cuda.empty_cache() after deterministic forward to release
transient memory before the training loop starts.

Made-with: Cursor
---
 profiling/model_profiling_specs.json | 30 ++++++++++++++++++++++++++++
 src/lerobot/utils/profiling_utils.py |  2 ++
 2 files changed, 32 insertions(+)

diff --git a/profiling/model_profiling_specs.json b/profiling/model_profiling_specs.json
index 1d782fc3d..f30337663 100644
--- a/profiling/model_profiling_specs.json
+++ b/profiling/model_profiling_specs.json
@@ -63,6 +63,16 @@
       "--policy.use_amp=true",
       "--policy.gradient_checkpointing=true",
       "--batch_size=1",
+      "--use_policy_training_preset=false",
+      "--optimizer.type=sgd",
+      "--optimizer.lr=1e-5",
+      "--optimizer.weight_decay=0",
+      "--optimizer.grad_clip_norm=1.0",
+      "--scheduler.type=cosine_decay_with_warmup",
+      "--scheduler.peak_lr=1e-5",
+      "--scheduler.decay_lr=1e-6",
+      "--scheduler.num_warmup_steps=0",
+      "--scheduler.num_decay_steps=12",
       "--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
     ]
   },
@@ -78,6 +88,16 @@
       "--policy.use_amp=true",
       "--policy.gradient_checkpointing=true",
       "--batch_size=1",
+      "--use_policy_training_preset=false",
+      "--optimizer.type=sgd",
+      "--optimizer.lr=1e-5",
+      "--optimizer.weight_decay=0",
+      "--optimizer.grad_clip_norm=1.0",
+      "--scheduler.type=cosine_decay_with_warmup",
+      "--scheduler.peak_lr=1e-5",
+      "--scheduler.decay_lr=1e-6",
+      "--scheduler.num_warmup_steps=0",
+      "--scheduler.num_decay_steps=12",
       "--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
     ]
   },
@@ -93,6 +113,16 @@
       "--policy.use_amp=true",
       "--policy.gradient_checkpointing=true",
       "--batch_size=1",
+      "--use_policy_training_preset=false",
+      "--optimizer.type=sgd",
+      "--optimizer.lr=1e-5",
+      "--optimizer.weight_decay=0",
+      "--optimizer.grad_clip_norm=1.0",
+      "--scheduler.type=cosine_decay_with_warmup",
+      "--scheduler.peak_lr=1e-5",
+      "--scheduler.decay_lr=1e-6",
+      "--scheduler.num_warmup_steps=0",
+      "--scheduler.num_decay_steps=12",
       "--policy.normalization_mapping={\"ACTION\": \"MEAN_STD\", \"STATE\": \"MEAN_STD\", \"VISUAL\": \"IDENTITY\"}",
       "--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
     ]
diff --git a/src/lerobot/utils/profiling_utils.py b/src/lerobot/utils/profiling_utils.py
index 5cfc66f3b..85931051d 100644
--- a/src/lerobot/utils/profiling_utils.py
+++ b/src/lerobot/utils/profiling_utils.py
@@ -343,6 +343,8 @@ class TrainingProfiler:
             output_dir=self._output_dir,
             device_type=self._device.type,
         )
+        if self._device.type == "cuda":
+            torch.cuda.empty_cache()
 
     def __enter__(self) -> TrainingProfiler:
         if self._device.type == "cuda":