update docs, and small improvements in train

This commit is contained in:
Pepijn
2025-10-14 13:31:52 +02:00
parent a86cea5708
commit 50ff388bf6
2 changed files with 37 additions and 28 deletions
+16 -20
View File
@@ -13,27 +13,9 @@ pip install accelerate
Or install it with the LeRobot accelerate extra: Or install it with the LeRobot accelerate extra:
```bash ```bash
pip install lerobot[accelerate] pip install -e ".[accelerate]"
``` ```
## Configuration (Optional)
You can optionally configure accelerate for your hardware setup by running:
```bash
accelerate config
```
This interactive setup asks you questions about your training environment (number of GPUs, mixed precision settings, etc.) and saves the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings:
- Compute environment: This machine
- Number of machines: 1
- Number of processes: (number of GPUs you want to use)
- GPU ids to use: (leave empty to use all)
- Mixed precision: fp16 or bf16 (recommended for faster training)
**Note:** You can skip this step and specify parameters directly in the launch command (see Option 1 below).
## Training with Multiple GPUs ## Training with Multiple GPUs
You can launch training in two ways: You can launch training in two ways:
@@ -64,7 +46,21 @@ accelerate launch \
### Option 2: Using accelerate config ### Option 2: Using accelerate config
If you prefer to save your configuration, run `accelerate config` once and then simply launch with: If you prefer to save your configuration, configure accelerate for your hardware setup once by running:
```bash
accelerate config
```
This interactive setup asks you questions about your training environment (number of GPUs, mixed precision settings, etc.) and saves the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings:
- Compute environment: This machine
- Number of machines: 1
- Number of processes: (number of GPUs you want to use)
- GPU ids to use: (leave empty to use all)
- Mixed precision: fp16 or bf16 (recommended for faster training)
Then launch training with:
```bash ```bash
accelerate launch $(which lerobot-train) \ accelerate launch $(which lerobot-train) \
+21 -8
View File
@@ -101,12 +101,10 @@ def update_policy(
if accelerator: if accelerator:
accelerator.backward(loss) accelerator.backward(loss)
accelerator.unscale_gradients(optimizer=optimizer) if grad_clip_norm > 0:
grad_norm = torch.nn.utils.clip_grad_norm_( grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
policy.parameters(), else:
grad_clip_norm, grad_norm = torch.tensor(0.0, device=policy.device)
error_if_nonfinite=False,
)
optimizer.step() optimizer.step()
else: else:
grad_scaler.scale(loss).backward() grad_scaler.scale(loss).backward()
@@ -198,9 +196,18 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
torch.backends.cudnn.benchmark = True torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cuda.matmul.allow_tf32 = True
# Dataset loading synchronization: main process downloads first to avoid race conditions
if is_main_process: if is_main_process:
logging.info("Creating dataset") logging.info("Creating dataset")
dataset = make_dataset(cfg) dataset = make_dataset(cfg)
# Wait for main process to finish downloading/caching dataset
if accelerator:
accelerator.wait_for_everyone()
# Now all other processes can safely load the dataset
if not is_main_process:
dataset = make_dataset(cfg)
# Create environment used for evaluating checkpoints during training on simulation data. # Create environment used for evaluating checkpoints during training on simulation data.
# On real-world data, no need to create an environment as evaluations are done outside train.py, # On real-world data, no need to create an environment as evaluations are done outside train.py,
@@ -270,6 +277,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
logging.info(f"{dataset.num_episodes=}") logging.info(f"{dataset.num_episodes=}")
if accelerator:
num_processes = accelerator.num_processes
effective_bs = cfg.batch_size * num_processes
logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}")
logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
@@ -312,8 +323,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
"dataloading_s": AverageMeter("data_s", ":.3f"), "dataloading_s": AverageMeter("data_s", ":.3f"),
} }
# Use effective batch size for proper epoch calculation in distributed training
effective_batch_size = cfg.batch_size * (accelerator.num_processes if accelerator else 1)
train_tracker = MetricsTracker( train_tracker = MetricsTracker(
cfg.batch_size, effective_batch_size,
dataset.num_frames, dataset.num_frames,
dataset.num_episodes, dataset.num_episodes,
train_metrics, train_metrics,