mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 20:19:43 +00:00
cleanup
This commit is contained in:
@@ -22,7 +22,6 @@ You can specify all parameters directly in the command without running `accelera
|
|||||||
accelerate launch \
|
accelerate launch \
|
||||||
--multi_gpu \
|
--multi_gpu \
|
||||||
--num_processes=2 \
|
--num_processes=2 \
|
||||||
--mixed_precision=fp16 \
|
|
||||||
$(which lerobot-train) \
|
$(which lerobot-train) \
|
||||||
--dataset.repo_id=${HF_USER}/my_dataset \
|
--dataset.repo_id=${HF_USER}/my_dataset \
|
||||||
--policy.type=act \
|
--policy.type=act \
|
||||||
@@ -75,10 +74,6 @@ When you launch training with accelerate:
|
|||||||
3. **Gradient synchronization**: Gradients are synchronized across GPUs during backpropagation
|
3. **Gradient synchronization**: Gradients are synchronized across GPUs during backpropagation
|
||||||
4. **Single-process logging**: Only the main process logs to wandb and saves checkpoints
|
4. **Single-process logging**: Only the main process logs to wandb and saves checkpoints
|
||||||
|
|
||||||
## Mixed Precision Training
|
|
||||||
|
|
||||||
For faster training, you can enable mixed precision (fp16 or bf16). This is configured during `accelerate config` or by passing `--mixed_precision=fp16` to `accelerate launch`. LeRobot's `use_amp` setting is automatically handled when using accelerate.
|
|
||||||
|
|
||||||
## Learning Rate and Training Steps Scaling
|
## Learning Rate and Training Steps Scaling
|
||||||
|
|
||||||
**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
|
**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
|
||||||
|
|||||||
@@ -145,8 +145,14 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
|
|||||||
# It will automatically detect if running in distributed mode or single-process mode
|
# It will automatically detect if running in distributed mode or single-process mode
|
||||||
# We set step_scheduler_with_optimizer=False to prevent accelerate from adjusting
|
# We set step_scheduler_with_optimizer=False to prevent accelerate from adjusting
|
||||||
# the lr_scheduler steps based on the num_processes
|
# the lr_scheduler steps based on the num_processes
|
||||||
|
# We set find_unused_parameters=True to handle models with conditional computation paths
|
||||||
if accelerator is None:
|
if accelerator is None:
|
||||||
accelerator = Accelerator(step_scheduler_with_optimizer=False)
|
from accelerate.utils import DistributedDataParallelKwargs
|
||||||
|
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
|
||||||
|
accelerator = Accelerator(
|
||||||
|
step_scheduler_with_optimizer=False,
|
||||||
|
kwargs_handlers=[ddp_kwargs]
|
||||||
|
)
|
||||||
|
|
||||||
# Determine if this is the main process (for logging and checkpointing)
|
# Determine if this is the main process (for logging and checkpointing)
|
||||||
# When using accelerate, only the main process should log to avoid duplicate outputs
|
# When using accelerate, only the main process should log to avoid duplicate outputs
|
||||||
|
|||||||
Reference in New Issue
Block a user