diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx index 4cda9cb01..a6cd540bc 100644 --- a/docs/source/multi_gpu_training.mdx +++ b/docs/source/multi_gpu_training.mdx @@ -16,15 +16,15 @@ Or install it with the LeRobot accelerate extra: pip install lerobot[accelerate] ``` -## Configuration +## Configuration (Optional) -Configure accelerate for your hardware setup by running: +You can optionally configure accelerate for your hardware setup by running: ```bash accelerate config ``` -This interactive setup will ask you questions about your training environment (number of GPUs, mixed precision settings, etc.). For a simple multi-GPU setup on a single machine, you can use these recommended settings: +This interactive setup will ask you questions about your training environment (number of GPUs, mixed precision settings, etc.) and save the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings: - Compute environment: This machine - Number of machines: 1 @@ -32,26 +32,15 @@ This interactive setup will ask you questions about your training environment (n - GPU ids to use: (leave empty to use all) - Mixed precision: fp16 or bf16 (recommended for faster training) +**Note:** You can skip this step and specify parameters directly in the launch command (see Option 1 below). 
+ ## Training with Multiple GPUs You can launch training in two ways: -### Option 1: Using accelerate config (recommended) +### Option 1: Without config (specify parameters directly) -If you ran `accelerate config`, simply launch with: - -```bash -accelerate launch $(which lerobot-train) \ - --dataset.repo_id=${HF_USER}/my_dataset \ - --policy.type=act \ - --output_dir=outputs/train/act_multi_gpu \ - --job_name=act_multi_gpu \ - --wandb.enable=true -``` - -### Option 2: Without config (specify parameters directly) - -If you prefer not to run `accelerate config`, you can specify all parameters in the command: +You can specify all parameters directly in the command without running `accelerate config`: ```bash accelerate launch \ @@ -61,6 +50,7 @@ accelerate launch \ $(which lerobot-train) \ --dataset.repo_id=${HF_USER}/my_dataset \ --policy.type=act \ + --policy.repo_id=${HF_USER}/my_trained_policy \ --output_dir=outputs/train/act_multi_gpu \ --job_name=act_multi_gpu \ --wandb.enable=true @@ -71,6 +61,20 @@ accelerate launch \ - `--num_processes=2`: Number of GPUs to use - `--mixed_precision=fp16`: Use fp16 mixed precision (or `bf16` if supported) +### Option 2: Using accelerate config + +If you prefer to save your configuration, run `accelerate config` once and then simply launch with: + +```bash +accelerate launch $(which lerobot-train) \ + --dataset.repo_id=${HF_USER}/my_dataset \ + --policy.type=act \ + --policy.repo_id=${HF_USER}/my_trained_policy \ + --output_dir=outputs/train/act_multi_gpu \ + --job_name=act_multi_gpu \ + --wandb.enable=true +``` + ## How It Works When you launch training with accelerate: diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index 6202990ed..65e86cb46 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -163,17 +163,20 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): cfg: A `TrainPipelineConfig` object containing all 
training configurations. """ cfg.validate() - logging.info(pformat(cfg.to_dict())) - + if accelerator and not accelerator.is_main_process: # Disable logging on non-main processes. cfg.wandb.enable = False + if not accelerator or accelerator.is_main_process: + logging.info(pformat(cfg.to_dict())) + if cfg.wandb.enable and cfg.wandb.project: wandb_logger = WandBLogger(cfg) else: wandb_logger = None - logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"])) + if not accelerator or accelerator.is_main_process: + logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"])) if cfg.seed is not None: set_seed(cfg.seed, accelerator=accelerator) @@ -183,7 +186,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True - logging.info("Creating dataset") + if not accelerator or accelerator.is_main_process: + logging.info("Creating dataset") dataset = make_dataset(cfg) # Create environment used for evaluating checkpoints during training on simulation data. @@ -191,10 +195,12 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): # using the eval.py instead, with gym_dora environment and dora-rs. 
eval_env = None if cfg.eval_freq > 0 and cfg.env is not None: - logging.info("Creating env") + if not accelerator or accelerator.is_main_process: + logging.info("Creating env") eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) - logging.info("Creating policy") + if not accelerator or accelerator.is_main_process: + logging.info("Creating policy") policy = make_policy( cfg=cfg.policy, ds_meta=dataset.meta, @@ -232,7 +238,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): **postprocessor_kwargs, ) - logging.info("Creating optimizer and scheduler") + if not accelerator or accelerator.is_main_process: + logging.info("Creating optimizer and scheduler") optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) @@ -304,6 +311,7 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None): if not accelerator or accelerator.is_main_process: logging.info("Start offline training on a fixed dataset") + for _ in range(step, cfg.steps): start_time = time.perf_counter() batch = next(dl_iter)