refactor(jobs): check dataset presence with api.repo_exists

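Replace the api.dataset_info(repo_id) call wrapped in try/except
RepositoryNotFoundError with a direct
api.repo_exists(repo_id, repo_type="dataset") check. The tests no longer need
to build a RepositoryNotFoundError from an httpx response, so that scaffolding
is dropped and the mock only sets api.repo_exists.return_value.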
This commit is contained in:
Nicolas Rabault
2026-06-24 11:08:47 +02:00
parent 62badcba4e
commit ac67e64ed2
2 changed files with 3 additions and 19 deletions
+1 -6
View File
@@ -24,8 +24,6 @@ from __future__ import annotations
import os
from pathlib import Path
from huggingface_hub.errors import RepositoryNotFoundError
def ensure_dataset_available(repo_id: str, *, api, tags: list[str] | None = None) -> None:
"""Ensure repo_id resolves on the Hub, pushing a local-only dataset privately first.
@@ -34,11 +32,8 @@ def ensure_dataset_available(repo_id: str, *, api, tags: list[str] | None = None
dataset is left untouched). Raises RuntimeError if the dataset is neither on
the Hub nor in the local cache.
"""
try:
api.dataset_info(repo_id)
if api.repo_exists(repo_id, repo_type="dataset"):
return
except RepositoryNotFoundError:
pass
cache_root = Path(os.environ.get("HF_LEROBOT_HOME", "~/.cache/huggingface/lerobot")).expanduser()
local_present = (cache_root / repo_id / "meta" / "info.json").is_file()
+2 -13
View File
@@ -15,25 +15,14 @@
import sys
from unittest.mock import MagicMock
import httpx
import pytest
from huggingface_hub.errors import RepositoryNotFoundError
from lerobot.jobs.dataset import ensure_dataset_available
def _repo_not_found() -> RepositoryNotFoundError:
req = httpx.Request("GET", "https://huggingface.co/datasets/test")
resp = httpx.Response(404, request=req)
return RepositoryNotFoundError("nope", response=resp)
def _api_with_dataset(exists: bool):
api = MagicMock()
if exists:
api.dataset_info.return_value = object()
else:
api.dataset_info.side_effect = _repo_not_found()
api.repo_exists.return_value = exists
return api
@@ -48,7 +37,7 @@ def _make_local_cache(tmp_path, repo_id: str) -> None:
def test_dataset_already_on_hub_is_noop():
api = _api_with_dataset(True)
assert ensure_dataset_available("user/ds", api=api) is None
api.dataset_info.assert_called_once_with("user/ds")
api.repo_exists.assert_called_once_with("user/ds", repo_type="dataset")
# Branch 2: not on Hub but present locally → always push privately.