mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 11:09:59 +00:00
Place logging under accelerate and update docs
This commit is contained in:
@@ -16,15 +16,15 @@ Or install it with the LeRobot accelerate extra:
|
|||||||
pip install lerobot[accelerate]
|
pip install lerobot[accelerate]
|
||||||
```
|
```
|
||||||
|
|
||||||
## Configuration
|
## Configuration (Optional)
|
||||||
|
|
||||||
Configure accelerate for your hardware setup by running:
|
You can optionally configure accelerate for your hardware setup by running:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
accelerate config
|
accelerate config
|
||||||
```
|
```
|
||||||
|
|
||||||
This interactive setup will ask you questions about your training environment (number of GPUs, mixed precision settings, etc.). For a simple multi-GPU setup on a single machine, you can use these recommended settings:
|
This interactive setup asks you questions about your training environment (number of GPUs, mixed precision settings, etc.) and saves the configuration for future use. For a simple multi-GPU setup on a single machine, you can use these recommended settings:
|
||||||
|
|
||||||
- Compute environment: This machine
|
- Compute environment: This machine
|
||||||
- Number of machines: 1
|
- Number of machines: 1
|
||||||
@@ -32,26 +32,15 @@ This interactive setup will ask you questions about your training environment (n
|
|||||||
- GPU ids to use: (leave empty to use all)
|
- GPU ids to use: (leave empty to use all)
|
||||||
- Mixed precision: fp16 or bf16 (recommended for faster training)
|
- Mixed precision: fp16 or bf16 (recommended for faster training)
|
||||||
|
|
||||||
|
**Note:** You can skip this step and specify parameters directly in the launch command (see Option 1 below).
|
||||||
|
|
||||||
## Training with Multiple GPUs
|
## Training with Multiple GPUs
|
||||||
|
|
||||||
You can launch training in two ways:
|
You can launch training in two ways:
|
||||||
|
|
||||||
### Option 1: Using accelerate config (recommended)
|
### Option 1: Without config (specify parameters directly)
|
||||||
|
|
||||||
If you ran `accelerate config`, simply launch with:
|
You can specify all parameters directly in the command without running `accelerate config`:
|
||||||
|
|
||||||
```bash
|
|
||||||
accelerate launch $(which lerobot-train) \
|
|
||||||
--dataset.repo_id=${HF_USER}/my_dataset \
|
|
||||||
--policy.type=act \
|
|
||||||
--output_dir=outputs/train/act_multi_gpu \
|
|
||||||
--job_name=act_multi_gpu \
|
|
||||||
--wandb.enable=true
|
|
||||||
```
|
|
||||||
|
|
||||||
### Option 2: Without config (specify parameters directly)
|
|
||||||
|
|
||||||
If you prefer not to run `accelerate config`, you can specify all parameters in the command:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
accelerate launch \
|
accelerate launch \
|
||||||
@@ -61,6 +50,7 @@ accelerate launch \
|
|||||||
$(which lerobot-train) \
|
$(which lerobot-train) \
|
||||||
--dataset.repo_id=${HF_USER}/my_dataset \
|
--dataset.repo_id=${HF_USER}/my_dataset \
|
||||||
--policy.type=act \
|
--policy.type=act \
|
||||||
|
--policy.repo_id=${HF_USER}/my_trained_policy \
|
||||||
--output_dir=outputs/train/act_multi_gpu \
|
--output_dir=outputs/train/act_multi_gpu \
|
||||||
--job_name=act_multi_gpu \
|
--job_name=act_multi_gpu \
|
||||||
--wandb.enable=true
|
--wandb.enable=true
|
||||||
@@ -71,6 +61,20 @@ accelerate launch \
|
|||||||
- `--num_processes=2`: Number of GPUs to use
|
- `--num_processes=2`: Number of GPUs to use
|
||||||
- `--mixed_precision=fp16`: Use fp16 mixed precision (or `bf16` if supported)
|
- `--mixed_precision=fp16`: Use fp16 mixed precision (or `bf16` if supported)
|
||||||
|
|
||||||
|
### Option 2: Using accelerate config
|
||||||
|
|
||||||
|
If you prefer to save your configuration, run `accelerate config` once and then simply launch with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
accelerate launch $(which lerobot-train) \
|
||||||
|
--dataset.repo_id=${HF_USER}/my_dataset \
|
||||||
|
--policy.type=act \
|
||||||
|
--policy.repo_id=${HF_USER}/my_trained_policy \
|
||||||
|
--output_dir=outputs/train/act_multi_gpu \
|
||||||
|
--job_name=act_multi_gpu \
|
||||||
|
--wandb.enable=true
|
||||||
|
```
|
||||||
|
|
||||||
## How It Works
|
## How It Works
|
||||||
|
|
||||||
When you launch training with accelerate:
|
When you launch training with accelerate:
|
||||||
|
|||||||
@@ -163,17 +163,20 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
cfg: A `TrainPipelineConfig` object containing all training configurations.
|
cfg: A `TrainPipelineConfig` object containing all training configurations.
|
||||||
"""
|
"""
|
||||||
cfg.validate()
|
cfg.validate()
|
||||||
logging.info(pformat(cfg.to_dict()))
|
|
||||||
|
|
||||||
if accelerator and not accelerator.is_main_process:
|
if accelerator and not accelerator.is_main_process:
|
||||||
# Disable logging on non-main processes.
|
# Disable logging on non-main processes.
|
||||||
cfg.wandb.enable = False
|
cfg.wandb.enable = False
|
||||||
|
|
||||||
|
if not accelerator or accelerator.is_main_process:
|
||||||
|
logging.info(pformat(cfg.to_dict()))
|
||||||
|
|
||||||
if cfg.wandb.enable and cfg.wandb.project:
|
if cfg.wandb.enable and cfg.wandb.project:
|
||||||
wandb_logger = WandBLogger(cfg)
|
wandb_logger = WandBLogger(cfg)
|
||||||
else:
|
else:
|
||||||
wandb_logger = None
|
wandb_logger = None
|
||||||
logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
|
if not accelerator or accelerator.is_main_process:
|
||||||
|
logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
|
||||||
|
|
||||||
if cfg.seed is not None:
|
if cfg.seed is not None:
|
||||||
set_seed(cfg.seed, accelerator=accelerator)
|
set_seed(cfg.seed, accelerator=accelerator)
|
||||||
@@ -183,7 +186,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
torch.backends.cudnn.benchmark = True
|
torch.backends.cudnn.benchmark = True
|
||||||
torch.backends.cuda.matmul.allow_tf32 = True
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
|
|
||||||
logging.info("Creating dataset")
|
if not accelerator or accelerator.is_main_process:
|
||||||
|
logging.info("Creating dataset")
|
||||||
dataset = make_dataset(cfg)
|
dataset = make_dataset(cfg)
|
||||||
|
|
||||||
# Create environment used for evaluating checkpoints during training on simulation data.
|
# Create environment used for evaluating checkpoints during training on simulation data.
|
||||||
@@ -191,10 +195,12 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
# using the eval.py instead, with gym_dora environment and dora-rs.
|
# using the eval.py instead, with gym_dora environment and dora-rs.
|
||||||
eval_env = None
|
eval_env = None
|
||||||
if cfg.eval_freq > 0 and cfg.env is not None:
|
if cfg.eval_freq > 0 and cfg.env is not None:
|
||||||
logging.info("Creating env")
|
if not accelerator or accelerator.is_main_process:
|
||||||
|
logging.info("Creating env")
|
||||||
eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
|
eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
|
||||||
|
|
||||||
logging.info("Creating policy")
|
if not accelerator or accelerator.is_main_process:
|
||||||
|
logging.info("Creating policy")
|
||||||
policy = make_policy(
|
policy = make_policy(
|
||||||
cfg=cfg.policy,
|
cfg=cfg.policy,
|
||||||
ds_meta=dataset.meta,
|
ds_meta=dataset.meta,
|
||||||
@@ -232,7 +238,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
**postprocessor_kwargs,
|
**postprocessor_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
logging.info("Creating optimizer and scheduler")
|
if not accelerator or accelerator.is_main_process:
|
||||||
|
logging.info("Creating optimizer and scheduler")
|
||||||
optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
|
optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
|
||||||
grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp)
|
grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp)
|
||||||
|
|
||||||
@@ -304,6 +311,7 @@ def train(cfg: TrainPipelineConfig, accelerator: Callable | None = None):
|
|||||||
|
|
||||||
if not accelerator or accelerator.is_main_process:
|
if not accelerator or accelerator.is_main_process:
|
||||||
logging.info("Start offline training on a fixed dataset")
|
logging.info("Start offline training on a fixed dataset")
|
||||||
|
|
||||||
for _ in range(step, cfg.steps):
|
for _ in range(step, cfg.steps):
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
batch = next(dl_iter)
|
batch = next(dl_iter)
|
||||||
|
|||||||
Reference in New Issue
Block a user