fix formatting

2026-07-23 01:41:54 +00:00 · 2025-10-14 17:37:47 +02:00
parent f8a185f753
commit ebf64bd80e
6 changed files with 50 additions and 65 deletions
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 import os
 import time
 from contextlib import nullcontext
 from pprint import pformat
@@ -143,16 +142,13 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
    # Create Accelerator if not provided
    # It will automatically detect if running in distributed mode or single-process mode
-    # We set step_scheduler_with_optimizer=False to prevent accelerate from adjusting
+    # We set step_scheduler_with_optimizer=False to prevent accelerate from adjusting the lr_scheduler steps based on the num_processes
-    # the lr_scheduler steps based on the num_processes
+    # We set find_unused_parameters=True to handle models with conditional computation
    # We set find_unused_parameters=True to handle models with conditional computation paths
    if accelerator is None:
        from accelerate.utils import DistributedDataParallelKwargs
        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
-        accelerator = Accelerator(
+        accelerator = Accelerator(step_scheduler_with_optimizer=False, kwargs_handlers=[ddp_kwargs])
-            step_scheduler_with_optimizer=False,
-            kwargs_handlers=[ddp_kwargs]
-        )
    # Determine if this is the main process (for logging and checkpointing)
    # When using accelerate, only the main process should log to avoid duplicate outputs
@@ -183,7 +179,6 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
        logging.info("Creating dataset")
        dataset = make_dataset(cfg)
    # Wait for main process to finish downloading/caching dataset
    accelerator.wait_for_everyone()
    # Now all other processes can safely load the dataset
@@ -341,7 +336,7 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
        step += 1
        train_tracker.step()
        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 and is_main_process
-        is_saving_step = (step % cfg.save_freq == 0 or step == cfg.steps)
+        is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
        is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0
        if is_log_step:
@@ -21,7 +21,6 @@ import subprocess
 import sys
 import time
 from collections.abc import Callable
-from accelerate import Accelerator
 from copy import copy, deepcopy
 from datetime import datetime
 from pathlib import Path
@@ -29,6 +28,7 @@ from statistics import mean
 import numpy as np
 import torch
+from accelerate import Accelerator
 from datasets.utils.logging import disable_progress_bar, enable_progress_bar
@@ -128,6 +128,7 @@ def init_logging(
        file_level: Logging level for file output
        accelerator: Optional Accelerator instance (for multi-GPU detection)
    """
    def custom_format(record: logging.LogRecord) -> str:
        dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        fnameline = f"{record.pathname}:{record.lineno}"
@@ -159,7 +160,6 @@ def init_logging(
        logger.addHandler(logging.NullHandler())
        logger.setLevel(logging.ERROR)
    # File logging (optional, all processes)
    if log_file is not None:
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
@@ -178,6 +178,7 @@ def format_big_number(num, precision=0):
    return num
 def say(text: str, blocking: bool = False):
    system = platform.system()
@@ -25,9 +25,7 @@ The tests automatically generate accelerate configs and launch training
 with subprocess to properly test the distributed training environment.
 """
 import json
 import os
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
@@ -70,15 +68,6 @@ def run_accelerate_training(config_args, num_processes=4, temp_dir=None):
    Returns:
        subprocess.CompletedProcess result
    """
    # Create accelerate config
    accelerate_config = {
        "compute_environment": "LOCAL_MACHINE",
        "distributed_type": "MULTI_GPU",
        "mixed_precision": "no",
        "num_processes": num_processes,
        "use_cpu": False,
        "gpu_ids": "all",
    }
    config_path = Path(temp_dir) / "accelerate_config.yaml"