mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-29 22:27:14 +00:00
fix(jobs): default remote job timeout to 2d instead of the platform default
HF Jobs applies its own short 30-minute timeout when none is sent, which silently kills long training runs. Pass an explicit, generous 2d cap by default; users can still override --job.timeout to fail fast or extend it.
This commit is contained in:
@@ -630,7 +630,7 @@ If your dataset exists only locally (not yet on the Hub), it is automatically pu
|
||||
|
||||
Every job (and any dataset pushed by the run) is tagged `lerobot` so it's easy to find on the Hub. Add your own with `--job.tags '["my-tag"]'`.
|
||||
|
||||
By default the job runs until training finishes, with no time limit. Cap it with an HF Jobs duration string if you want a hard ceiling, e.g. `--job.timeout=4h`.
|
||||
By default the job is capped at `2d` (48h) of wall-clock time. Override it with an HF Jobs duration string, e.g. `--job.timeout=4h` to fail fast or `--job.timeout=7d` for a longer run.
|
||||
|
||||
> **Note:** the model repo is created up front (it holds the staged training config the job runs from). If a run fails before the model is pushed, that repo is left on the Hub so you can inspect it — it is not deleted automatically, so repeated failures can leave empty repos behind. Remove one with `hf repo delete <repo-id>`.
|
||||
|
||||
|
||||
@@ -156,8 +156,9 @@ class JobConfig:
|
||||
# Runtime image for the remote job (ignored for local runs).
|
||||
image: str = "huggingface/lerobot-gpu:latest"
|
||||
# Max wall-clock for the remote job as an HF Jobs duration string (e.g. "2h").
|
||||
# None (default) imposes no timeout — the job runs until the command finishes.
|
||||
timeout: str | None = None
|
||||
# Defaults to "2d": an explicit, generous cap instead of the short HF Jobs default. Set a smaller
|
||||
# value to fail fast, or a larger one for long runs.
|
||||
timeout: str | None = "2d"
|
||||
# Submit and exit instead of streaming the job logs in the foreground.
|
||||
detach: bool = False
|
||||
# Extra tags attached to the HF job and to any dataset this run pushes to the
|
||||
|
||||
@@ -24,7 +24,7 @@ def test_jobconfig_defaults_are_local():
|
||||
assert cfg.target is None
|
||||
assert cfg.is_remote is False
|
||||
assert cfg.image == "huggingface/lerobot-gpu:latest"
|
||||
assert cfg.timeout is None
|
||||
assert cfg.timeout == "2d"
|
||||
assert cfg.detach is False
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user