From 50ff388bf68a073f753cdf571a572aed7199ddf9 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 14 Oct 2025 13:31:52 +0200 Subject: [PATCH] update docs, and small improvements in train --- docs/source/multi_gpu_training.mdx | 36 +++++++++++++--------------- src/lerobot/scripts/lerobot_train.py | 29 +++++++++++++++------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx index 497deb462..41e5b0794 100644 --- a/docs/source/multi_gpu_training.mdx +++ b/docs/source/multi_gpu_training.mdx @@ -13,27 +13,9 @@ pip install accelerate Or install it with the LeRobot accelerate extra: ```bash -pip install lerobot[accelerate] +pip install -e ".[accelerate]" ``` -## Configuration (Optional) - -You can optionally configure accelerate for your hardware setup by running: - -```bash -accelerate config -``` - -This interactive setup will ask you questions about your training environment (number of GPUs, mixed precision settings, etc.) and saves the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings: - -- Compute environment: This machine -- Number of machines: 1 -- Number of processes: (number of GPUs you want to use) -- GPU ids to use: (leave empty to use all) -- Mixed precision: fp16 or bf16 (recommended for faster training) - -**Note:** You can skip this step and specify parameters directly in the launch command (see Option 1 below). - ## Training with Multiple GPUs You can launch training in two ways: @@ -64,7 +46,21 @@ accelerate launch \ ### Option 2: Using accelerate config -If you prefer to save your configuration, run `accelerate config` once and then simply launch with: +If you prefer to save your configuration, you can optionally configure accelerate for your hardware setup by running: + +```bash +accelerate config +``` + +This interactive setup will ask you questions about your training environment (number of GPUs, mixed precision settings, etc.) and saves the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings: + +- Compute environment: This machine +- Number of machines: 1 +- Number of processes: (number of GPUs you want to use) +- GPU ids to use: (leave empty to use all) +- Mixed precision: fp16 or bf16 (recommended for faster training) + +Then launch training with: ```bash accelerate launch $(which lerobot-train) \ diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index 88fa26997..9057f8ed2 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -101,12 +101,10 @@ def update_policy( if accelerator: accelerator.backward(loss) - accelerator.unscale_gradients(optimizer=optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_( - policy.parameters(), - grad_clip_norm, - error_if_nonfinite=False, - ) + if grad_clip_norm > 0: + grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm) + else: + grad_norm = torch.tensor(0.0, device=policy.device) optimizer.step() else: grad_scaler.scale(loss).backward() @@ -198,9 +196,18 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True + # Dataset loading synchronization: main process downloads first to avoid race conditions if is_main_process: logging.info("Creating dataset") - dataset = make_dataset(cfg) + dataset = make_dataset(cfg) + + # Wait for main process to finish downloading/caching dataset + if accelerator: + accelerator.wait_for_everyone() + + # Now all other processes can safely load the dataset + if not is_main_process: + dataset = make_dataset(cfg) # Create environment used for evaluating checkpoints during training on simulation data. # On real-world data, no need to create an environment as evaluations are done outside train.py, @@ -270,6 +277,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") logging.info(f"{dataset.num_episodes=}") + if accelerator: + num_processes = accelerator.num_processes + effective_bs = cfg.batch_size * num_processes + logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}") logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") @@ -312,8 +323,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): "dataloading_s": AverageMeter("data_s", ":.3f"), } + # Use effective batch size for proper epoch calculation in distributed training + effective_batch_size = cfg.batch_size * (accelerator.num_processes if accelerator else 1) train_tracker = MetricsTracker( - cfg.batch_size, + effective_batch_size, dataset.num_frames, dataset.num_episodes, train_metrics,