mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 11:09:59 +00:00
fix(profiling): use SGD for pi0/pi05/pi0_fast and free CUDA cache after deterministic forward
Adam's optimizer state (exp_avg + exp_avg_sq) requires ~16GB on top of the model parameters and gradients for 4B-parameter models, which exceeds the 22GB GPU. SGD has zero optimizer-state overhead, and profiling only measures forward/backward timing anyway. Also adds torch.cuda.empty_cache() after the deterministic forward pass to release transient memory before the training loop starts.
Made-with: Cursor
This commit is contained in:
@@ -63,6 +63,16 @@
|
|||||||
"--policy.use_amp=true",
|
"--policy.use_amp=true",
|
||||||
"--policy.gradient_checkpointing=true",
|
"--policy.gradient_checkpointing=true",
|
||||||
"--batch_size=1",
|
"--batch_size=1",
|
||||||
|
"--use_policy_training_preset=false",
|
||||||
|
"--optimizer.type=sgd",
|
||||||
|
"--optimizer.lr=1e-5",
|
||||||
|
"--optimizer.weight_decay=0",
|
||||||
|
"--optimizer.grad_clip_norm=1.0",
|
||||||
|
"--scheduler.type=cosine_decay_with_warmup",
|
||||||
|
"--scheduler.peak_lr=1e-5",
|
||||||
|
"--scheduler.decay_lr=1e-6",
|
||||||
|
"--scheduler.num_warmup_steps=0",
|
||||||
|
"--scheduler.num_decay_steps=12",
|
||||||
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -78,6 +88,16 @@
|
|||||||
"--policy.use_amp=true",
|
"--policy.use_amp=true",
|
||||||
"--policy.gradient_checkpointing=true",
|
"--policy.gradient_checkpointing=true",
|
||||||
"--batch_size=1",
|
"--batch_size=1",
|
||||||
|
"--use_policy_training_preset=false",
|
||||||
|
"--optimizer.type=sgd",
|
||||||
|
"--optimizer.lr=1e-5",
|
||||||
|
"--optimizer.weight_decay=0",
|
||||||
|
"--optimizer.grad_clip_norm=1.0",
|
||||||
|
"--scheduler.type=cosine_decay_with_warmup",
|
||||||
|
"--scheduler.peak_lr=1e-5",
|
||||||
|
"--scheduler.decay_lr=1e-6",
|
||||||
|
"--scheduler.num_warmup_steps=0",
|
||||||
|
"--scheduler.num_decay_steps=12",
|
||||||
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -93,6 +113,16 @@
|
|||||||
"--policy.use_amp=true",
|
"--policy.use_amp=true",
|
||||||
"--policy.gradient_checkpointing=true",
|
"--policy.gradient_checkpointing=true",
|
||||||
"--batch_size=1",
|
"--batch_size=1",
|
||||||
|
"--use_policy_training_preset=false",
|
||||||
|
"--optimizer.type=sgd",
|
||||||
|
"--optimizer.lr=1e-5",
|
||||||
|
"--optimizer.weight_decay=0",
|
||||||
|
"--optimizer.grad_clip_norm=1.0",
|
||||||
|
"--scheduler.type=cosine_decay_with_warmup",
|
||||||
|
"--scheduler.peak_lr=1e-5",
|
||||||
|
"--scheduler.decay_lr=1e-6",
|
||||||
|
"--scheduler.num_warmup_steps=0",
|
||||||
|
"--scheduler.num_decay_steps=12",
|
||||||
"--policy.normalization_mapping={\"ACTION\": \"MEAN_STD\", \"STATE\": \"MEAN_STD\", \"VISUAL\": \"IDENTITY\"}",
|
"--policy.normalization_mapping={\"ACTION\": \"MEAN_STD\", \"STATE\": \"MEAN_STD\", \"VISUAL\": \"IDENTITY\"}",
|
||||||
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
"--rename_map={\"observation.images.front\": \"observation.images.base_0_rgb\", \"observation.images.wrist\": \"observation.images.left_wrist_0_rgb\"}"
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -343,6 +343,8 @@ class TrainingProfiler:
|
|||||||
output_dir=self._output_dir,
|
output_dir=self._output_dir,
|
||||||
device_type=self._device.type,
|
device_type=self._device.type,
|
||||||
)
|
)
|
||||||
|
if self._device.type == "cuda":
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
def __enter__(self) -> TrainingProfiler:
|
def __enter__(self) -> TrainingProfiler:
|
||||||
if self._device.type == "cuda":
|
if self._device.type == "cuda":
|
||||||
|
|||||||
Reference in New Issue
Block a user