From 77e4ea2afa7fd2f066915efa1f9922e6a0af68f3 Mon Sep 17 00:00:00 2001 From: Nicolas Rabault Date: Thu, 25 Jun 2026 21:23:01 +0200 Subject: [PATCH] fix(jobs): default remote job timeout to 2d instead of the platform default HF Jobs applies its own short 30-minute timeout when none is sent, which silently kills long training runs. Pass an explicit, generous 2d cap by default; users can still override --job.timeout to fail fast or extend it. --- docs/source/il_robots.mdx | 2 +- src/lerobot/configs/default.py | 5 +++-- tests/jobs/test_job_config.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index 595e580c2..178db13bb 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -630,7 +630,7 @@ If your dataset exists only locally (not yet on the Hub), it is automatically pu Every job (and any dataset pushed by the run) is tagged `lerobot` so it's easy to find on the Hub. Add your own with `--job.tags '["my-tag"]'`. -By default the job runs until training finishes, with no time limit. Cap it with an HF Jobs duration string if you want a hard ceiling, e.g. `--job.timeout=4h`. +By default the job is capped at `2d` (48h) of wall-clock time. Override it with an HF Jobs duration string, e.g. `--job.timeout=4h` to fail faster or `--job.timeout=7d` for a longer run. > **Note:** the model repo is created up front (it holds the staged training config the job runs from). If a run fails before the model is pushed, that repo is left on the Hub so you can inspect it — it is not deleted automatically, so repeated failures can leave empty repos behind. Remove one with `hf repo delete <repo_id>`. diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index b97a38684..38991a665 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -156,8 +156,9 @@ class JobConfig: # Runtime image for the remote job (ignored for local runs). 
image: str = "huggingface/lerobot-gpu:latest" # Max wall-clock for the remote job as an HF Jobs duration string (e.g. "2h"). - # None (default) imposes no timeout — the job runs until the command finishes. - timeout: str | None = None + # Defaults to "2d" because HF Jobs imposes a short timeout of its own when none is + # sent. Set a smaller value to fail fast, or a larger one for long runs. + timeout: str | None = "2d" # Submit and exit instead of streaming the job logs in the foreground. detach: bool = False # Extra tags attached to the HF job and to any dataset this run pushes to the diff --git a/tests/jobs/test_job_config.py b/tests/jobs/test_job_config.py index 7254e1fa1..20760fb18 100644 --- a/tests/jobs/test_job_config.py +++ b/tests/jobs/test_job_config.py @@ -24,7 +24,7 @@ def test_jobconfig_defaults_are_local(): assert cfg.target is None assert cfg.is_remote is False assert cfg.image == "huggingface/lerobot-gpu:latest" - assert cfg.timeout is None + assert cfg.timeout == "2d" assert cfg.detach is False