mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-21 11:39:50 +00:00
update docs, and small improvements in train
This commit is contained in:
@@ -13,27 +13,9 @@ pip install accelerate
|
|||||||
Or install it with the LeRobot accelerate extra:
|
Or install it with the LeRobot accelerate extra:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install lerobot[accelerate]
|
pip install -e ".[accelerate]"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Configuration (Optional)
|
|
||||||
|
|
||||||
You can optionally configure accelerate for your hardware setup by running:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
accelerate config
|
|
||||||
```
|
|
||||||
|
|
||||||
This interactive setup asks you questions about your training environment (number of GPUs, mixed precision settings, etc.) and saves the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings:
|
|
||||||
|
|
||||||
- Compute environment: This machine
|
|
||||||
- Number of machines: 1
|
|
||||||
- Number of processes: (number of GPUs you want to use)
|
|
||||||
- GPU ids to use: (leave empty to use all)
|
|
||||||
- Mixed precision: fp16 or bf16 (recommended for faster training)
|
|
||||||
|
|
||||||
**Note:** You can skip this step and specify parameters directly in the launch command (see Option 1 below).
|
|
||||||
|
|
||||||
## Training with Multiple GPUs
|
## Training with Multiple GPUs
|
||||||
|
|
||||||
You can launch training in two ways:
|
You can launch training in two ways:
|
||||||
@@ -64,7 +46,21 @@ accelerate launch \
|
|||||||
|
|
||||||
### Option 2: Using accelerate config
|
### Option 2: Using accelerate config
|
||||||
|
|
||||||
If you prefer to save your configuration, run `accelerate config` once and then simply launch with:
|
If you prefer to save your configuration, you can configure accelerate for your hardware setup once by running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
accelerate config
|
||||||
|
```
|
||||||
|
|
||||||
|
This interactive setup asks you questions about your training environment (number of GPUs, mixed precision settings, etc.) and saves the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings:
|
||||||
|
|
||||||
|
- Compute environment: This machine
|
||||||
|
- Number of machines: 1
|
||||||
|
- Number of processes: (number of GPUs you want to use)
|
||||||
|
- GPU ids to use: (leave empty to use all)
|
||||||
|
- Mixed precision: fp16 or bf16 (recommended for faster training)
|
||||||
|
|
||||||
|
Then launch training with:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
accelerate launch $(which lerobot-train) \
|
accelerate launch $(which lerobot-train) \
|
||||||
|
|||||||
@@ -101,12 +101,10 @@ def update_policy(
|
|||||||
|
|
||||||
if accelerator:
|
if accelerator:
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
accelerator.unscale_gradients(optimizer=optimizer)
|
if grad_clip_norm > 0:
|
||||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
|
||||||
policy.parameters(),
|
else:
|
||||||
grad_clip_norm,
|
grad_norm = torch.tensor(0.0, device=policy.device)
|
||||||
error_if_nonfinite=False,
|
|
||||||
)
|
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
else:
|
else:
|
||||||
grad_scaler.scale(loss).backward()
|
grad_scaler.scale(loss).backward()
|
||||||
@@ -198,9 +196,18 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
torch.backends.cudnn.benchmark = True
|
torch.backends.cudnn.benchmark = True
|
||||||
torch.backends.cuda.matmul.allow_tf32 = True
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
|
|
||||||
|
# Dataset loading synchronization: main process downloads first to avoid race conditions
|
||||||
if is_main_process:
|
if is_main_process:
|
||||||
logging.info("Creating dataset")
|
logging.info("Creating dataset")
|
||||||
dataset = make_dataset(cfg)
|
dataset = make_dataset(cfg)
|
||||||
|
|
||||||
|
# Wait for main process to finish downloading/caching dataset
|
||||||
|
if accelerator:
|
||||||
|
accelerator.wait_for_everyone()
|
||||||
|
|
||||||
|
# Now all other processes can safely load the dataset
|
||||||
|
if not is_main_process:
|
||||||
|
dataset = make_dataset(cfg)
|
||||||
|
|
||||||
# Create environment used for evaluating checkpoints during training on simulation data.
|
# Create environment used for evaluating checkpoints during training on simulation data.
|
||||||
# On real-world data, no need to create an environment as evaluations are done outside train.py,
|
# On real-world data, no need to create an environment as evaluations are done outside train.py,
|
||||||
@@ -270,6 +277,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
|
logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
|
||||||
logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
|
logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
|
||||||
logging.info(f"{dataset.num_episodes=}")
|
logging.info(f"{dataset.num_episodes=}")
|
||||||
|
if accelerator:
|
||||||
|
num_processes = accelerator.num_processes
|
||||||
|
effective_bs = cfg.batch_size * num_processes
|
||||||
|
logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}")
|
||||||
logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
|
logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
|
||||||
logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
|
logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
|
||||||
|
|
||||||
@@ -312,8 +323,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
"dataloading_s": AverageMeter("data_s", ":.3f"),
|
"dataloading_s": AverageMeter("data_s", ":.3f"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Use effective batch size for proper epoch calculation in distributed training
|
||||||
|
effective_batch_size = cfg.batch_size * (accelerator.num_processes if accelerator else 1)
|
||||||
train_tracker = MetricsTracker(
|
train_tracker = MetricsTracker(
|
||||||
cfg.batch_size,
|
effective_batch_size,
|
||||||
dataset.num_frames,
|
dataset.num_frames,
|
||||||
dataset.num_episodes,
|
dataset.num_episodes,
|
||||||
train_metrics,
|
train_metrics,
|
||||||
|
|||||||
Reference in New Issue
Block a user