From 2fb5c7add07e0afe19f020399481f736f7d64d24 Mon Sep 17 00:00:00 2001 From: Ignat Georgiev Date: Sun, 8 Mar 2026 13:29:33 +0200 Subject: [PATCH] feat(train): add cudnn_deterministic option for reproducible training (#3102) Add a `cudnn_deterministic` flag to `TrainPipelineConfig` (default: False) that sets `torch.backends.cudnn.deterministic = True` and disables benchmark mode, making cuDNN select deterministic convolution algorithms at the cost of ~10-20% training speed. Operations outside cuDNN may still be non-deterministic. When False (default) the existing benchmark=True behaviour is preserved. --- src/lerobot/configs/train.py | 3 +++ src/lerobot/scripts/lerobot_train.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index 7a5eee77d..9d20afc68 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -50,6 +50,9 @@ class TrainPipelineConfig(HubMixin): # `seed` is used for training (eg: model initialization, dataset shuffling) # AND for the evaluation environments. seed: int | None = 1000 + # Set to True to use deterministic cuDNN algorithms for reproducibility. + # This disables cudnn.benchmark and may reduce training speed by ~10-20%. + cudnn_deterministic: bool = False # Number of workers for the dataloader. 
num_workers: int = 4 batch_size: int = 8 diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index 04d43d91e..1fed3bee4 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -209,7 +209,11 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): # Use accelerator's device device = accelerator.device - torch.backends.cudnn.benchmark = True + if cfg.cudnn_deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True # Dataset loading synchronization: main process downloads first to avoid race conditions