From e1cf646e84e1b8748d66cb5ad994d79200128009 Mon Sep 17 00:00:00 2001 From: pepijn Date: Mon, 29 Jun 2026 14:14:24 +0000 Subject: [PATCH] fix(logging): correct multi-rank "max" metric reduction accelerate.reduce only implements sum/mean (max silently returned the SUM across ranks, inflating max-reduced metrics by num_processes). Gather per-rank values and reduce explicitly for max/sum/mean. Co-authored-by: Cursor --- src/lerobot/utils/logging_utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/lerobot/utils/logging_utils.py b/src/lerobot/utils/logging_utils.py index 20673fc30..eb02b4ba6 100644 --- a/src/lerobot/utils/logging_utils.py +++ b/src/lerobot/utils/logging_utils.py @@ -176,10 +176,21 @@ class MetricsTracker: if not buckets: return + # NB: don't use ``accelerator.reduce(..., reduction="max")`` — accelerate only implements + # "sum"/"mean" (it always all-reduces with SUM and divides for "mean"), so "max" silently + # returns the SUM across ranks, inflating every "max" metric by ``num_processes`` (e.g. a + # 3.5s step reported as 28s on 8 GPUs). Gather per-rank values and reduce them explicitly. device = self.accelerator.device + num_processes = self.accelerator.num_processes for reduction, names in buckets.items(): - tensor = torch.tensor([self.metrics[n].avg for n in names], dtype=torch.float32, device=device) - reduced = self.accelerator.reduce(tensor, reduction=reduction) + local = torch.tensor([self.metrics[n].avg for n in names], dtype=torch.float32, device=device) + gathered = self.accelerator.gather(local).view(num_processes, len(names)) + if reduction == "max": + reduced = gathered.amax(dim=0) + elif reduction == "sum": + reduced = gathered.sum(dim=0) + else: # "mean" + reduced = gathered.mean(dim=0) for name, value in zip(names, reduced.tolist(), strict=True): meter = self.metrics[name] # Preserve avg == sum / count so a later .update() on this meter accumulates