From 3c8e54dcfacb2f519d9e68662911e30855925b00 Mon Sep 17 00:00:00 2001 From: Nicolas Rabault Date: Thu, 25 Jun 2026 16:21:18 +0200 Subject: [PATCH] docs(jobs): document the model-pushed marker contract and orphaned repos Follow-up to the claude[bot] review on #3856 (non-blocking observations): - Cross-reference the "Model pushed to <url>" log line between its producer (PreTrainedPolicy.push_model_to_hub) and the remote-run consumer in submit_to_hf, noting the contract is an early-finish optimization that falls back to status polling if it drifts. - Note in the HF Jobs guide that a failed remote run leaves its model repo on the Hub (it is not auto-deleted) and how to remove it. --- docs/source/il_robots.mdx | 2 ++ src/lerobot/jobs/hf.py | 5 ++++- src/lerobot/policies/pretrained.py | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index 16f043fc2..0b7009357 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -626,6 +626,8 @@ Every job (and any dataset pushed by the run) is tagged `lerobot` so it's easy t By default the job runs until training finishes, with no time limit. Cap it with an HF Jobs duration string if you want a hard ceiling, e.g. `--job.timeout=4h`. +> **Note:** the model repo is created up front (it holds the staged training config the job runs from). If a run fails before the model is pushed, that repo is left on the Hub so you can inspect it — it is not deleted automatically, so repeated failures can leave empty repos behind. Remove one with `hf repo delete <repo_id>`. + **Prerequisites:** run `hf auth login` before submitting. For Weights & Biases integration, run `wandb login` or set `WANDB_API_KEY` on your machine — the key is forwarded to the job automatically. 
#### Upload policy checkpoints diff --git a/src/lerobot/jobs/hf.py b/src/lerobot/jobs/hf.py index d7968683a..da4b826e1 100644 --- a/src/lerobot/jobs/hf.py +++ b/src/lerobot/jobs/hf.py @@ -300,7 +300,10 @@ def submit_to_hf(cfg: TrainPipelineConfig) -> None: poll_thread = threading.Thread(target=_poll, daemon=True) poll_thread.start() # Finish as soon as the model is pushed, rather than waiting out the platform's - # post-run finalization before the job stage flips to COMPLETED. + # post-run finalization before the job stage flips to COMPLETED. This matches the + # exact log line emitted by PreTrainedPolicy.push_model_to_hub — the two must stay + # in sync. If it ever stops matching we just fall back to stage-based completion + # (~30s slower), so the contract is an optimization, not a correctness requirement. success_marker = f"Model pushed to https://huggingface.co/{repo_id}" log_thread = threading.Thread( target=_tail_logs, args=(job_id, done, success_marker, pushed_ok), daemon=True diff --git a/src/lerobot/policies/pretrained.py b/src/lerobot/policies/pretrained.py index a7aabb3f3..aea5f1b08 100644 --- a/src/lerobot/policies/pretrained.py +++ b/src/lerobot/policies/pretrained.py @@ -340,6 +340,9 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC): ignore_patterns=["*.tmp", "*.log"], ) + # Contract: lerobot.jobs.hf.submit_to_hf watches for this exact + # "Model pushed to <url>" line to end a remote run early. Keep the wording + # and URL format in sync (it falls back to status polling if they drift). logging.info(f"Model pushed to {commit_info.repo_url.url}") def generate_model_card(