Place logging under accelerate and update docs

This commit is contained in:
Pepijn
2025-10-10 11:25:53 +02:00
parent 52751e8e6d
commit 95b6035baa
2 changed files with 37 additions and 25 deletions
+22 -18
View File
@@ -16,15 +16,15 @@ Or install it with the LeRobot accelerate extra:
pip install lerobot[accelerate] pip install lerobot[accelerate]
``` ```
## Configuration ## Configuration (Optional)
Configure accelerate for your hardware setup by running: You can optionally configure accelerate for your hardware setup by running:
```bash ```bash
accelerate config accelerate config
``` ```
This interactive setup will ask you questions about your training environment (number of GPUs, mixed precision settings, etc.). For a simple multi-GPU setup on a single machine, you can use these recommended settings: This interactive setup will ask you questions about your training environment (number of GPUs, mixed precision settings, etc.) and save the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings:
- Compute environment: This machine - Compute environment: This machine
- Number of machines: 1 - Number of machines: 1
@@ -32,26 +32,15 @@ This interactive setup will ask you questions about your training environment (n
- GPU ids to use: (leave empty to use all) - GPU ids to use: (leave empty to use all)
- Mixed precision: fp16 or bf16 (recommended for faster training) - Mixed precision: fp16 or bf16 (recommended for faster training)
**Note:** You can skip this step and specify parameters directly in the launch command (see Option 1 below).
## Training with Multiple GPUs ## Training with Multiple GPUs
You can launch training in two ways: You can launch training in two ways:
### Option 1: Using accelerate config (recommended) ### Option 1: Without config (specify parameters directly)
If you ran `accelerate config`, simply launch with: You can specify all parameters directly in the command without running `accelerate config`:
```bash
accelerate launch $(which lerobot-train) \
--dataset.repo_id=${HF_USER}/my_dataset \
--policy.type=act \
--output_dir=outputs/train/act_multi_gpu \
--job_name=act_multi_gpu \
--wandb.enable=true
```
### Option 2: Without config (specify parameters directly)
If you prefer not to run `accelerate config`, you can specify all parameters in the command:
```bash ```bash
accelerate launch \ accelerate launch \
@@ -61,6 +50,7 @@ accelerate launch \
$(which lerobot-train) \ $(which lerobot-train) \
--dataset.repo_id=${HF_USER}/my_dataset \ --dataset.repo_id=${HF_USER}/my_dataset \
--policy.type=act \ --policy.type=act \
--policy.repo_id=${HF_USER}/my_trained_policy \
--output_dir=outputs/train/act_multi_gpu \ --output_dir=outputs/train/act_multi_gpu \
--job_name=act_multi_gpu \ --job_name=act_multi_gpu \
--wandb.enable=true --wandb.enable=true
@@ -71,6 +61,20 @@ accelerate launch \
- `--num_processes=2`: Number of GPUs to use - `--num_processes=2`: Number of GPUs to use
- `--mixed_precision=fp16`: Use fp16 mixed precision (or `bf16` if supported) - `--mixed_precision=fp16`: Use fp16 mixed precision (or `bf16` if supported)
### Option 2: Using accelerate config
If you prefer to save your configuration, run `accelerate config` once and then simply launch with:
```bash
accelerate launch $(which lerobot-train) \
--dataset.repo_id=${HF_USER}/my_dataset \
--policy.type=act \
--policy.repo_id=${HF_USER}/my_trained_policy \
--output_dir=outputs/train/act_multi_gpu \
--job_name=act_multi_gpu \
--wandb.enable=true
```
## How It Works ## How It Works
When you launch training with accelerate: When you launch training with accelerate:
+15 -7
View File
@@ -163,17 +163,20 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
cfg: A `TrainPipelineConfig` object containing all training configurations. cfg: A `TrainPipelineConfig` object containing all training configurations.
""" """
cfg.validate() cfg.validate()
logging.info(pformat(cfg.to_dict()))
if accelerator and not accelerator.is_main_process: if accelerator and not accelerator.is_main_process:
# Disable logging on non-main processes. # Disable logging on non-main processes.
cfg.wandb.enable = False cfg.wandb.enable = False
if not accelerator or accelerator.is_main_process:
logging.info(pformat(cfg.to_dict()))
if cfg.wandb.enable and cfg.wandb.project: if cfg.wandb.enable and cfg.wandb.project:
wandb_logger = WandBLogger(cfg) wandb_logger = WandBLogger(cfg)
else: else:
wandb_logger = None wandb_logger = None
logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"])) if not accelerator or accelerator.is_main_process:
logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
if cfg.seed is not None: if cfg.seed is not None:
set_seed(cfg.seed, accelerator=accelerator) set_seed(cfg.seed, accelerator=accelerator)
@@ -183,7 +186,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
torch.backends.cudnn.benchmark = True torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cuda.matmul.allow_tf32 = True
logging.info("Creating dataset") if not accelerator or accelerator.is_main_process:
logging.info("Creating dataset")
dataset = make_dataset(cfg) dataset = make_dataset(cfg)
# Create environment used for evaluating checkpoints during training on simulation data. # Create environment used for evaluating checkpoints during training on simulation data.
@@ -191,10 +195,12 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
# using the eval.py instead, with gym_dora environment and dora-rs. # using the eval.py instead, with gym_dora environment and dora-rs.
eval_env = None eval_env = None
if cfg.eval_freq > 0 and cfg.env is not None: if cfg.eval_freq > 0 and cfg.env is not None:
logging.info("Creating env") if not accelerator or accelerator.is_main_process:
logging.info("Creating env")
eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
logging.info("Creating policy") if not accelerator or accelerator.is_main_process:
logging.info("Creating policy")
policy = make_policy( policy = make_policy(
cfg=cfg.policy, cfg=cfg.policy,
ds_meta=dataset.meta, ds_meta=dataset.meta,
@@ -232,7 +238,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
**postprocessor_kwargs, **postprocessor_kwargs,
) )
logging.info("Creating optimizer and scheduler") if not accelerator or accelerator.is_main_process:
logging.info("Creating optimizer and scheduler")
optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp)
@@ -304,6 +311,7 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
if not accelerator or accelerator.is_main_process: if not accelerator or accelerator.is_main_process:
logging.info("Start offline training on a fixed dataset") logging.info("Start offline training on a fixed dataset")
for _ in range(step, cfg.steps): for _ in range(step, cfg.steps):
start_time = time.perf_counter() start_time = time.perf_counter()
batch = next(dl_iter) batch = next(dl_iter)