From a764c3e1d671f102bb8d565b3b6b4758c9060a93 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 5 May 2026 18:23:18 +0200 Subject: [PATCH] fix(datasets,annotate): tag pushed dataset + clean revision error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs combine to make the brand-new ``_tool3`` dataset unloadable: 1. ``lerobot_annotate.py:_push_to_hub`` uploads the annotated dataset folder but never creates a codebase-version tag, so ``api/datasets/<repo_id>/refs`` returns ``"tags": []``. Then ``LeRobotDatasetMetadata`` → ``get_safe_version`` → ``get_repo_versions`` returns empty and the loader raises ``RevisionNotFoundError``. 2. ``RevisionNotFoundError`` itself was unconstructible: its ``HfHubHTTPError.__init__`` indexes ``response.headers`` unconditionally on current ``huggingface_hub`` versions, so constructing it without a real ``Response`` blew up with ``AttributeError: 'NoneType' object has no attribute 'headers'``, masking the real "no tag" message. Fix #1: after upload, read ``meta/info.json["codebase_version"]`` and call ``HfApi.create_tag(..., tag=<codebase_version>, repo_type='dataset', exist_ok=True)`` so the dataset is loadable straight from the Hub on the next ``LeRobotDataset(repo_id)`` call. Falls back to the in-tree ``CODEBASE_VERSION`` if info.json is missing/malformed; on tag creation failure, prints the manual one-liner the user needs. Fix #2: stop trying to instantiate ``RevisionNotFoundError`` (which inherits HfHubHTTPError) for what is really a config issue, not an HTTP failure. Raise plain ``RuntimeError`` with the same message — the caller actually sees what's wrong instead of an upstream attribute error. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lerobot/datasets/utils.py | 33 +++++++++++----------- src/lerobot/scripts/lerobot_annotate.py | 37 +++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index 8cbf89fa8..d296554d6 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -238,24 +238,23 @@ def get_safe_version(repo_id: str, version: str | packaging.version.Version) -> if not hub_versions: msg = ( - f"Repo {repo_id!r} has no codebase-version tags. " - f"Either the dataset doesn't exist on the Hub yet, or it was " - f"pushed without a version tag. To tag an existing dataset:\n" - f"```python\n" - f"from huggingface_hub import HfApi\n" - f"HfApi().create_tag({repo_id!r}, tag='_version_', repo_type='dataset')\n" - f"```" + f"Repo {repo_id!r} has no codebase-version tags. The dataset " + f"either doesn't exist on the Hub yet, or it was uploaded " + f"without a ``v3.x``-style tag. To tag an existing dataset run:\n" + f" from huggingface_hub import HfApi\n" + f" HfApi().create_tag({repo_id!r}, tag='v3.0', repo_type='dataset', exist_ok=True)" ) - # ``RevisionNotFoundError`` extends ``HfHubHTTPError`` which on - # newer ``huggingface_hub`` versions makes ``response`` a required - # keyword arg. Pass ``response=None`` explicitly so this raises - # with a clean message instead of an upstream - # ``TypeError: __init__() missing 1 required keyword-only argument: 'response'``. - try: - raise RevisionNotFoundError(msg, response=None) - except TypeError: - # Older ``huggingface_hub`` (no ``response`` kwarg). - raise RevisionNotFoundError(msg) # noqa: B904 + # ``RevisionNotFoundError`` extends ``HfHubHTTPError`` whose + # ``__init__`` indexes ``response.headers`` unconditionally on + # current ``huggingface_hub`` versions. 
Constructing it without + # a real ``Response`` object crashes with either + # ``TypeError: missing 1 required keyword-only argument`` (old + # builds) or ``AttributeError: 'NoneType' object has no attribute + # 'headers'`` (new builds). Skip that path entirely — this isn't + # really an HTTP error, it's a configuration issue — and raise a + # plain ``RuntimeError`` so the message actually reaches the + # caller. + raise RuntimeError(msg) if target_version in hub_versions: return f"v{target_version}" diff --git a/src/lerobot/scripts/lerobot_annotate.py b/src/lerobot/scripts/lerobot_annotate.py index 61148b1a4..b58ea26a2 100644 --- a/src/lerobot/scripts/lerobot_annotate.py +++ b/src/lerobot/scripts/lerobot_annotate.py @@ -141,6 +141,43 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None: ) print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True) + # Tag the upload with the codebase version. ``LeRobotDatasetMetadata`` + # resolves the dataset revision via ``get_safe_version`` which scans + # for tags like ``v3.0``; without a tag it raises + # ``RevisionNotFoundError``. Read the version straight from the + # dataset's own ``meta/info.json`` so we tag whatever the writer + # actually wrote (no accidental drift if the codebase floor moves). 
+ from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415 + + info_path = root / "meta" / "info.json" + version_tag = CODEBASE_VERSION + if info_path.exists(): + try: + from lerobot.utils.io_utils import load_json # noqa: PLC0415 + + info = load_json(info_path) + ds_version = info.get("codebase_version") + if isinstance(ds_version, str) and ds_version.startswith("v"): + version_tag = ds_version + except Exception as exc: # noqa: BLE001 + print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True) + try: + api.create_tag( + repo_id=repo_id, + tag=version_tag, + repo_type="dataset", + exist_ok=True, + ) + print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True) + except Exception as exc: # noqa: BLE001 + print( + f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. " + "Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. " + "Run: from huggingface_hub import HfApi; " + f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)", + flush=True, + ) + def main() -> None: annotate()