mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 20:19:43 +00:00
feat(annotate): add dest_repo_id for separate push target
Adds an optional `dest_repo_id` to AnnotationPipelineConfig. When set, `push_to_hub` uploads the annotated dataset there instead of overwriting the source `repo_id`, restoring separate source/destination repos.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -20,10 +20,13 @@ Spawns one ``h200x2`` job that:
|
|||||||
1. installs this branch of ``lerobot`` plus the annotation extras,
|
1. installs this branch of ``lerobot`` plus the annotation extras,
|
||||||
2. boots two vllm servers (one per GPU) with Qwen3.6-35B-A3B-FP8,
|
2. boots two vllm servers (one per GPU) with Qwen3.6-35B-A3B-FP8,
|
||||||
3. runs the plan / interjections / vqa modules across the dataset,
|
3. runs the plan / interjections / vqa modules across the dataset,
|
||||||
4. uploads the annotated dataset back to ``--repo_id``.
|
4. uploads the annotated dataset back to ``--repo_id`` (or to
|
||||||
|
``--dest_repo_id`` when set).
|
||||||
|
|
||||||
``--repo_id`` is both the download source and, with ``--push_to_hub=true``,
|
``--repo_id`` is the download source and, with ``--push_to_hub=true``, also
|
||||||
the upload destination — the job annotates the dataset in place.
|
the default upload destination — the job annotates the dataset in place.
|
||||||
|
Pass ``--dest_repo_id`` to push the result to a separate repo instead and
|
||||||
|
leave the source untouched.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
@@ -53,9 +56,11 @@ CMD = (
|
|||||||
"export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
|
"export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
|
||||||
"export VLLM_VIDEO_BACKEND=pyav && "
|
"export VLLM_VIDEO_BACKEND=pyav && "
|
||||||
"lerobot-annotate "
|
"lerobot-annotate "
|
||||||
# The dataset to annotate; also the push destination (annotate in place).
|
# The dataset to annotate. By default it is also the push destination
|
||||||
|
# (annotate in place); pass --dest_repo_id to push to a separate repo.
|
||||||
"--repo_id=<your-org>/<your-dataset> "
|
"--repo_id=<your-org>/<your-dataset> "
|
||||||
"--push_to_hub=true "
|
"--push_to_hub=true "
|
||||||
|
# "--dest_repo_id=<your-org>/<your-annotated-dataset> "
|
||||||
"--vlm.backend=openai "
|
"--vlm.backend=openai "
|
||||||
"--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
|
"--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
|
||||||
"--vlm.parallel_servers=2 "
|
"--vlm.parallel_servers=2 "
|
||||||
|
|||||||
@@ -163,8 +163,15 @@ class AnnotationPipelineConfig:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Hub dataset id. Used as the download source when ``root`` is unset,
|
# Hub dataset id. Used as the download source when ``root`` is unset,
|
||||||
# and as the destination repo when ``push_to_hub`` is enabled.
|
# and as the destination repo when ``push_to_hub`` is enabled and
|
||||||
|
# ``dest_repo_id`` is unset.
|
||||||
repo_id: str | None = None
|
repo_id: str | None = None
|
||||||
|
|
||||||
|
# Optional separate Hub dataset id to push the annotated result to. When
|
||||||
|
# unset, ``push_to_hub`` uploads back to ``repo_id`` (annotate in place);
|
||||||
|
# when set, the source ``repo_id`` is left untouched.
|
||||||
|
dest_repo_id: str | None = None
|
||||||
|
|
||||||
root: Path | None = None
|
root: Path | None = None
|
||||||
|
|
||||||
# Defaults to ``<root>/.annotate_staging/`` when unset.
|
# Defaults to ``<root>/.annotate_staging/`` when unset.
|
||||||
@@ -182,8 +189,9 @@ class AnnotationPipelineConfig:
|
|||||||
skip_validation: bool = False
|
skip_validation: bool = False
|
||||||
only_episodes: tuple[int, ...] | None = None
|
only_episodes: tuple[int, ...] | None = None
|
||||||
|
|
||||||
# When True, upload the annotated dataset back to ``repo_id`` on the
|
# When True, upload the annotated dataset to the Hugging Face Hub:
|
||||||
# Hugging Face Hub. ``repo_id`` must be set for this to take effect.
|
# to ``dest_repo_id`` if set, otherwise back to ``repo_id``. One of
|
||||||
|
# the two must be set; otherwise the run fails with a ``ValueError``.
|
||||||
push_to_hub: bool = False
|
push_to_hub: bool = False
|
||||||
push_private: bool = False
|
push_private: bool = False
|
||||||
push_commit_message: str | None = None
|
push_commit_message: str | None = None
|
||||||
|
|||||||
@@ -113,16 +113,21 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
|
|||||||
logger.warning(w)
|
logger.warning(w)
|
||||||
|
|
||||||
if cfg.push_to_hub:
|
if cfg.push_to_hub:
|
||||||
if cfg.repo_id is None:
|
if cfg.repo_id is None and cfg.dest_repo_id is None:
|
||||||
raise ValueError("--push_to_hub requires --repo_id (the dataset repo to push to).")
|
raise ValueError(
|
||||||
|
"--push_to_hub requires --repo_id or --dest_repo_id (the dataset repo to push to)."
|
||||||
|
)
|
||||||
_push_to_hub(root, cfg)
|
_push_to_hub(root, cfg)
|
||||||
|
|
||||||
|
|
||||||
def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
|
def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
|
||||||
"""Upload the annotated dataset directory back to ``cfg.repo_id`` on the Hub."""
|
"""Upload the annotated dataset directory to the Hub.
|
||||||
|
|
||||||
|
Pushes to ``cfg.dest_repo_id`` when set, otherwise back to ``cfg.repo_id``.
|
||||||
|
"""
|
||||||
from huggingface_hub import HfApi # noqa: PLC0415
|
from huggingface_hub import HfApi # noqa: PLC0415
|
||||||
|
|
||||||
repo_id = cfg.repo_id
|
repo_id = cfg.dest_repo_id or cfg.repo_id
|
||||||
commit_message = cfg.push_commit_message or "Add steerable annotations (lerobot-annotate)"
|
commit_message = cfg.push_commit_message or "Add steerable annotations (lerobot-annotate)"
|
||||||
api = HfApi()
|
api = HfApi()
|
||||||
print(f"[lerobot-annotate] creating/locating dataset repo {repo_id}...", flush=True)
|
print(f"[lerobot-annotate] creating/locating dataset repo {repo_id}...", flush=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user