feat(annotate): add dest_repo_id for separate push target

Adds an optional `dest_repo_id` to AnnotationPipelineConfig. When set, `push_to_hub` uploads the annotated dataset there instead of overwriting the source `repo_id`, restoring separate source/destination repos.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 10:40:04 +00:00 · 2026-05-18 15:05:23 +02:00
parent 9dfc9084e1
commit c5676ef1b3
3 changed files with 29 additions and 11 deletions
@@ -20,10 +20,13 @@ Spawns one ``h200x2`` job that:
  1. installs this branch of ``lerobot`` plus the annotation extras,
  2. boots two vllm servers (one per GPU) with Qwen3.6-35B-A3B-FP8,
  3. runs the plan / interjections / vqa modules across the dataset,
-  4. uploads the annotated dataset back to ``--repo_id``.
+  4. uploads the annotated dataset back to ``--repo_id`` (or to
+     ``--dest_repo_id`` when set).

-``--repo_id`` is both the download source and, with ``--push_to_hub=true``,
-the upload destination — the job annotates the dataset in place.
+``--repo_id`` is the download source. With ``--push_to_hub=true`` it is also
+the default upload destination: unless redirected, the job annotates the
+dataset in place.
+Pass ``--dest_repo_id`` to push the result to a separate repo instead and
+leave the source untouched.

 Usage:

@@ -53,9 +56,11 @@ CMD = (
    "export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
    "export VLLM_VIDEO_BACKEND=pyav && "
    "lerobot-annotate "
-    # The dataset to annotate; also the push destination (annotate in place).
+    # The dataset to annotate. By default it is also the push destination
+    # (annotate in place); pass --dest_repo_id to push to a separate repo.
    "--repo_id=<your-org>/<your-dataset> "
    "--push_to_hub=true "
+    # "--dest_repo_id=<your-org>/<your-annotated-dataset> "
    "--vlm.backend=openai "
    "--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
    "--vlm.parallel_servers=2 "
@@ -163,8 +163,15 @@ class AnnotationPipelineConfig:
    """

    # Hub dataset id. Used as the download source when ``root`` is unset,
-    # and as the destination repo when ``push_to_hub`` is enabled.
+    # and as the destination repo when ``push_to_hub`` is enabled and
+    # ``dest_repo_id`` is unset.
    repo_id: str | None = None
+
+    # Optional separate Hub dataset id to push the annotated result to. When
+    # unset, ``push_to_hub`` uploads back to ``repo_id`` (annotate in place);
+    # when set, the source ``repo_id`` is left untouched.
+    dest_repo_id: str | None = None
+
    root: Path | None = None

    # Defaults to ``<root>/.annotate_staging/`` when unset.
@@ -182,8 +189,9 @@ class AnnotationPipelineConfig:
    skip_validation: bool = False
    only_episodes: tuple[int, ...] | None = None

-    # When True, upload the annotated dataset back to ``repo_id`` on the
-    # Hugging Face Hub. ``repo_id`` must be set for this to take effect.
+    # When True, upload the annotated dataset to the Hugging Face Hub:
+    # to ``dest_repo_id`` if set, otherwise back to ``repo_id``. At least
+    # one of the two must be set; otherwise ``annotate`` raises ``ValueError``.
    push_to_hub: bool = False
    push_private: bool = False
    push_commit_message: str | None = None
@@ -113,16 +113,21 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
            logger.warning(w)

    if cfg.push_to_hub:
-        if cfg.repo_id is None:
-            raise ValueError("--push_to_hub requires --repo_id (the dataset repo to push to).")
+        if cfg.repo_id is None and cfg.dest_repo_id is None:
+            raise ValueError(
+                "--push_to_hub requires --repo_id or --dest_repo_id (the dataset repo to push to)."
+            )
        _push_to_hub(root, cfg)


 def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
-    """Upload the annotated dataset directory back to ``cfg.repo_id`` on the Hub."""
+    """Upload the annotated dataset directory to the Hub.
+
+    Pushes to ``cfg.dest_repo_id`` when set, otherwise back to ``cfg.repo_id``.
+    """
    from huggingface_hub import HfApi  # noqa: PLC0415

-    repo_id = cfg.repo_id
+    repo_id = cfg.dest_repo_id or cfg.repo_id
    commit_message = cfg.push_commit_message or "Add steerable annotations (lerobot-annotate)"
    api = HfApi()
    print(f"[lerobot-annotate] creating/locating dataset repo {repo_id}...", flush=True)