From ac67e64ed258b98145dad4203676e72970604a79 Mon Sep 17 00:00:00 2001 From: Nicolas Rabault Date: Wed, 24 Jun 2026 11:08:47 +0200 Subject: [PATCH] refactor(jobs): check dataset presence with api.repo_exists Replace the dataset_info try/except RepositoryNotFoundError dance with a direct api.repo_exists(repo_id, repo_type="dataset") call, dropping the httpx/RepositoryNotFoundError test scaffolding. --- src/lerobot/jobs/dataset.py | 7 +------ tests/jobs/test_dataset.py | 15 ++------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/lerobot/jobs/dataset.py b/src/lerobot/jobs/dataset.py index 21c978f62..efd641ab4 100644 --- a/src/lerobot/jobs/dataset.py +++ b/src/lerobot/jobs/dataset.py @@ -24,8 +24,6 @@ from __future__ import annotations import os from pathlib import Path -from huggingface_hub.errors import RepositoryNotFoundError - def ensure_dataset_available(repo_id: str, *, api, tags: list[str] | None = None) -> None: """Ensure repo_id resolves on the Hub, pushing a local-only dataset privately first. @@ -34,11 +32,8 @@ def ensure_dataset_available(repo_id: str, *, api, tags: list[str] | None = None dataset is left untouched). Raises RuntimeError if the dataset is neither on the Hub nor in the local cache. """ - try: - api.dataset_info(repo_id) + if api.repo_exists(repo_id, repo_type="dataset"): return - except RepositoryNotFoundError: - pass cache_root = Path(os.environ.get("HF_LEROBOT_HOME", "~/.cache/huggingface/lerobot")).expanduser() local_present = (cache_root / repo_id / "meta" / "info.json").is_file() diff --git a/tests/jobs/test_dataset.py b/tests/jobs/test_dataset.py index 56cf640b2..e89cc1805 100644 --- a/tests/jobs/test_dataset.py +++ b/tests/jobs/test_dataset.py @@ -15,25 +15,14 @@ import sys from unittest.mock import MagicMock -import httpx import pytest -from huggingface_hub.errors import RepositoryNotFoundError from lerobot.jobs.dataset import ensure_dataset_available -def _repo_not_found() -> RepositoryNotFoundError: - req = httpx.Request("GET", "https://huggingface.co/datasets/test") - resp = httpx.Response(404, request=req) - return RepositoryNotFoundError("nope", response=resp) - - def _api_with_dataset(exists: bool): api = MagicMock() - if exists: - api.dataset_info.return_value = object() - else: - api.dataset_info.side_effect = _repo_not_found() + api.repo_exists.return_value = exists return api @@ -48,7 +37,7 @@ def _make_local_cache(tmp_path, repo_id: str) -> None: def test_dataset_already_on_hub_is_noop(): api = _api_with_dataset(True) assert ensure_dataset_available("user/ds", api=api) is None - api.dataset_info.assert_called_once_with("user/ds") + api.repo_exists.assert_called_once_with("user/ds", repo_type="dataset") # Branch 2: not on Hub but present locally → always push privately.