From 3c8e54dcfacb2f519d9e68662911e30855925b00 Mon Sep 17 00:00:00 2001
From: Nicolas Rabault <rabault.nicolas@gmail.com>
Date: Thu, 25 Jun 2026 16:21:18 +0200
Subject: [PATCH] docs(jobs): document the model-pushed marker contract and
 orphaned repos

Follow-up to the claude[bot] review on #3856 (non-blocking observations):

- Cross-reference the "Model pushed to <url>" log line between its producer
  (PreTrainedPolicy.push_model_to_hub) and the remote-run consumer in
  submit_to_hf, noting the contract is an early-finish optimization that
  falls back to status polling if it drifts.
- Note in the HF Jobs guide that a failed remote run leaves its model repo
  on the Hub (it is not auto-deleted) and how to remove it.
---
 docs/source/il_robots.mdx          | 2 ++
 src/lerobot/jobs/hf.py             | 5 ++++-
 src/lerobot/policies/pretrained.py | 3 +++
 3 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx
index 16f043fc2..0b7009357 100644
--- a/docs/source/il_robots.mdx
+++ b/docs/source/il_robots.mdx
@@ -626,6 +626,8 @@ Every job (and any dataset pushed by the run) is tagged `lerobot` so it's easy t
 
 By default the job runs until training finishes, with no time limit. Cap it with an HF Jobs duration string if you want a hard ceiling, e.g. `--job.timeout=4h`.
 
+> **Note:** the model repo is created up front (it holds the staged training config the job runs from). If a run fails before the model is pushed, that repo is left on the Hub so you can inspect it — it is not deleted automatically, so repeated failures can leave behind repos that contain only the staged config and no model. Remove one with `hf repo delete <repo-id>`.
+
 **Prerequisites:** run `hf auth login` before submitting. For Weights & Biases integration, run `wandb login` or set `WANDB_API_KEY` on your machine — the key is forwarded to the job automatically.
 
 #### Upload policy checkpoints
diff --git a/src/lerobot/jobs/hf.py b/src/lerobot/jobs/hf.py
index d7968683a..da4b826e1 100644
--- a/src/lerobot/jobs/hf.py
+++ b/src/lerobot/jobs/hf.py
@@ -300,7 +300,10 @@ def submit_to_hf(cfg: TrainPipelineConfig) -> None:
     poll_thread = threading.Thread(target=_poll, daemon=True)
     poll_thread.start()
     # Finish as soon as the model is pushed, rather than waiting out the platform's
-    # post-run finalization before the job stage flips to COMPLETED.
+    # post-run finalization before the job stage flips to COMPLETED. The marker below
+    # must match the log line emitted by PreTrainedPolicy.push_model_to_hub — keep the
+    # two in sync. If they ever drift apart we just fall back to stage-based completion
+    # (~30s slower), so the contract is an optimization, not a correctness requirement.
     success_marker = f"Model pushed to https://huggingface.co/{repo_id}"
     log_thread = threading.Thread(
         target=_tail_logs, args=(job_id, done, success_marker, pushed_ok), daemon=True
diff --git a/src/lerobot/policies/pretrained.py b/src/lerobot/policies/pretrained.py
index a7aabb3f3..aea5f1b08 100644
--- a/src/lerobot/policies/pretrained.py
+++ b/src/lerobot/policies/pretrained.py
@@ -340,6 +340,9 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
                 ignore_patterns=["*.tmp", "*.log"],
             )
 
+            # Contract: lerobot.jobs.hf.submit_to_hf watches the job logs for this exact
+            # "Model pushed to <url>" line to end a remote run early. Keep the wording
+            # and URL format in sync (it falls back to job-stage polling if they drift).
             logging.info(f"Model pushed to {commit_info.repo_url.url}")
 
     def generate_model_card(