cleanup

2026-06-18 00:37:10 +00:00 · 2026-06-15 14:50:23 +00:00
parent 3ce50c3468
commit b42d124007
2 changed files with 3 additions and 4 deletions
@@ -138,10 +138,10 @@ num_machines: 1
 num_processes: 4
 fsdp_config:
  fsdp_version: 1
-  fsdp_sharding_strategy: FULL_SHARD                          # params + grads + optimizer (ZeRO-3)
+  fsdp_sharding_strategy: FULL_SHARD # params + grads + optimizer (ZeRO-3)
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: <YourTransformerBlock>  # repeated block class to shard
-  fsdp_use_orig_params: true                                  # required: optimizer is built pre-prepare
+  fsdp_transformer_layer_cls_to_wrap: <YourTransformerBlock> # repeated block class to shard
+  fsdp_use_orig_params: true # required: optimizer is built pre-prepare
  fsdp_state_dict_type: FULL_STATE_DICT
 ```

@@ -198,7 +198,6 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
    # We set step_scheduler_with_optimizer=False to prevent accelerate from adjusting the lr_scheduler steps based on the num_processes
    # We set find_unused_parameters=True to handle models with conditional computation
    if accelerator is None:
-
        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
        # Accelerate auto-detects the device based on the available hardware and ignores the policy.device setting.
        # Force the device to be CPU when the active config's device is set to CPU (works for both policy and reward model training).