fix(datasets,annotate): tag pushed dataset + clean revision error

Two bugs combining to make the brand-new ``_tool3`` dataset
unloadable:

1. ``lerobot_annotate.py:_push_to_hub`` uploads the annotated
   dataset folder but never creates a codebase-version tag, so
   ``api/datasets/<repo>/refs`` returns ``"tags": []``. Then
   ``LeRobotDatasetMetadata`` → ``get_safe_version`` →
   ``get_repo_versions`` returns empty and the loader raises
   ``RevisionNotFoundError``.

2. ``RevisionNotFoundError`` itself was unconstructible: its
   ``HfHubHTTPError.__init__`` indexes ``response.headers``
   unconditionally on current ``huggingface_hub`` versions, so
   constructing it without a real ``Response`` blew up with
   ``AttributeError: 'NoneType' object has no attribute 'headers'``,
   masking the real "no tag" message.

Fix #1: after upload, read ``meta/info.json["codebase_version"]`` and
``HfApi.create_tag(..., tag=<v3.x>, repo_type='dataset',
exist_ok=True)`` so the dataset is loadable straight from the Hub on
the next ``LeRobotDataset(repo_id)`` call. Falls back to the in-tree
``CODEBASE_VERSION`` if info.json is missing/malformed; on tag
creation failure, prints the manual one-liner the user needs.

Fix #2: stop trying to instantiate ``RevisionNotFoundError`` (which
inherits from ``HfHubHTTPError``) for what is really a config issue, not an
HTTP failure. Raise plain ``RuntimeError`` with the same message —
the caller actually sees what's wrong instead of an upstream
attribute error.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-05 18:23:18 +02:00
parent b416f287f2
commit a764c3e1d6
2 changed files with 53 additions and 17 deletions
+16 -17
View File
@@ -238,24 +238,23 @@ def get_safe_version(repo_id: str, version: str | packaging.version.Version) ->
if not hub_versions:
msg = (
f"Repo {repo_id!r} has no codebase-version tags. "
f"Either the dataset doesn't exist on the Hub yet, or it was "
f"pushed without a version tag. To tag an existing dataset:\n"
f"```python\n"
f"from huggingface_hub import HfApi\n"
f"HfApi().create_tag({repo_id!r}, tag='_version_', repo_type='dataset')\n"
f"```"
f"Repo {repo_id!r} has no codebase-version tags. The dataset "
f"either doesn't exist on the Hub yet, or it was uploaded "
f"without a ``v3.x``-style tag. To tag an existing dataset run:\n"
f" from huggingface_hub import HfApi\n"
f" HfApi().create_tag({repo_id!r}, tag='v3.0', repo_type='dataset', exist_ok=True)"
)
# ``RevisionNotFoundError`` extends ``HfHubHTTPError`` which on
# newer ``huggingface_hub`` versions makes ``response`` a required
# keyword arg. Pass ``response=None`` explicitly so this raises
# with a clean message instead of an upstream
# ``TypeError: __init__() missing 1 required keyword-only argument: 'response'``.
try:
raise RevisionNotFoundError(msg, response=None)
except TypeError:
# Older ``huggingface_hub`` (no ``response`` kwarg).
raise RevisionNotFoundError(msg) # noqa: B904
# ``RevisionNotFoundError`` extends ``HfHubHTTPError`` whose
# ``__init__`` indexes ``response.headers`` unconditionally on
# current ``huggingface_hub`` versions. Constructing it without
# a real ``Response`` object crashes with either
# ``TypeError: missing 1 required keyword-only argument`` (old
# builds) or ``AttributeError: 'NoneType' object has no attribute
# 'headers'`` (new builds). Skip that path entirely — this isn't
# really an HTTP error, it's a configuration issue — and raise a
# plain ``RuntimeError`` so the message actually reaches the
# caller.
raise RuntimeError(msg)
if target_version in hub_versions:
return f"v{target_version}"
+37
View File
@@ -141,6 +141,43 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
)
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
# Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
# resolves the dataset revision via ``get_safe_version`` which scans
# for tags like ``v3.0``; without a tag it raises
# ``RevisionNotFoundError``. Read the version straight from the
# dataset's own ``meta/info.json`` so we tag whatever the writer
# actually wrote (no accidental drift if the codebase floor moves).
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415
info_path = root / "meta" / "info.json"
version_tag = CODEBASE_VERSION
if info_path.exists():
try:
from lerobot.utils.io_utils import load_json # noqa: PLC0415
info = load_json(info_path)
ds_version = info.get("codebase_version")
if isinstance(ds_version, str) and ds_version.startswith("v"):
version_tag = ds_version
except Exception as exc: # noqa: BLE001
print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
try:
api.create_tag(
repo_id=repo_id,
tag=version_tag,
repo_type="dataset",
exist_ok=True,
)
print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
except Exception as exc: # noqa: BLE001
print(
f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
"Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
"Run: from huggingface_hub import HfApi; "
f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
flush=True,
)
def main() -> None:
    """CLI entry point: delegate to the ``annotate`` pipeline runner."""
    annotate()