From 4170d1b6f182f5fe3a2b3021d2f3373a4dada1c2 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 14 Oct 2025 14:48:18 +0200
Subject: [PATCH] Remove mixed-precision docs; set find_unused_parameters=True for DDP

---
 docs/source/multi_gpu_training.mdx   | 5 -----
 src/lerobot/scripts/lerobot_train.py | 8 +++++++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx
index 5d8319acb..a1318e772 100644
--- a/docs/source/multi_gpu_training.mdx
+++ b/docs/source/multi_gpu_training.mdx
@@ -22,7 +22,6 @@ You can specify all parameters directly in the command without running `accelera
 accelerate launch \
   --multi_gpu \
   --num_processes=2 \
-  --mixed_precision=fp16 \
   $(which lerobot-train) \
   --dataset.repo_id=${HF_USER}/my_dataset \
   --policy.type=act \
@@ -75,10 +74,6 @@ When you launch training with accelerate:
 3. **Gradient synchronization**: Gradients are synchronized across GPUs during backpropagation
 4. **Single process logging**: Only the main process logs to wandb and saves checkpoints
 
-## Mixed Precision Training
-
-For faster training, you can enable mixed precision (fp16 or bf16). This is configured during `accelerate config` or by passing `--mixed_precision=fp16` to `accelerate launch`. LeRobot's `use_amp` setting is automatically handled when using accelerate.
-
 ## Learning Rate and Training Steps Scaling
 
 **Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index baf52c400..04b837269 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -145,8 +145,14 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
     # It will automatically detect if running in distributed mode or single-process mode
     # We set step_scheduler_with_optimizer=False to prevent accelerate from adjusting
     # the lr_scheduler steps based on the num_processes
+    # We set find_unused_parameters=True so DDP does not error when some parameters receive no gradient in a step (e.g. models with conditional computation paths)
     if accelerator is None:
-        accelerator = Accelerator(step_scheduler_with_optimizer=False)
+        from accelerate.utils import DistributedDataParallelKwargs
+        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+        accelerator = Accelerator(
+            step_scheduler_with_optimizer=False,
+            kwargs_handlers=[ddp_kwargs]
+        )
 
     # Determine if this is the main process (for logging and checkpointing)
     # When using accelerate, only the main process should log to avoid duplicate outputs