feat(eval): add hf_eval_results utility for HF leaderboard upload

Implements the missing lerobot.utils.hf_eval_results module imported by
lerobot_eval.py but never created. Provides:
- default_eval_date(): today's UTC date as ISO-8601
- build_eval_results_rows(): converts eval info dict → HF .eval_results rows
- upload_eval_results_yaml(): serialises rows and uploads to Hub model repo

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-03-20 22:41:21 -07:00
parent 3d5d8fa88a
commit 23bf69ebab
+161
View File
@@ -0,0 +1,161 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helpers for building and uploading HF-native `.eval_results` YAML rows.
The `.eval_results` format is consumed by the HuggingFace leaderboard surface.
See https://huggingface.co/docs/evaluate/en/evaluation-on-hub for the spec.
"""
from __future__ import annotations
import math
from datetime import date
from pathlib import Path
from typing import Any
import yaml
def default_eval_date() -> str:
"""Return today's UTC date as an ISO-8601 string (YYYY-MM-DD)."""
return date.today().isoformat()
def build_eval_results_rows(
*,
info: dict,
env_type: str,
env_task: str | None,
benchmark_dataset_id: str,
eval_date: str,
source_url: str | None = None,
notes: str | None = None,
) -> list[dict[str, Any]]:
"""Build a list of `.eval_results` rows from an eval info dict.
Each row represents one (task_group, metric) combination. When no
per-group breakdown is available a single overall row is emitted.
Args:
info: The dict returned by ``_aggregate_eval_from_per_task`` / ``eval_policy_all``.
Expected keys: ``overall``, optionally ``per_group``.
env_type: Environment type string (e.g. ``"libero_plus"``).
env_task: The env task string from eval config (may be ``None``).
benchmark_dataset_id: HF dataset repo-id of the consolidated benchmark dataset.
eval_date: ISO-8601 date string (use ``default_eval_date()``).
source_url: Optional URL to the evaluation run / report.
notes: Optional free-text notes about the evaluation setup.
Returns:
A list of row dicts ready for serialisation with ``upload_eval_results_yaml``.
"""
rows: list[dict[str, Any]] = []
task_name = env_task or env_type
def _safe(v: float) -> float:
return 0.0 if (v is None or (isinstance(v, float) and math.isnan(v))) else float(v)
def _make_row(config_name: str, pc_success: float, n_episodes: int) -> dict[str, Any]:
row: dict[str, Any] = {
"task": {
"type": "robotics",
"name": task_name,
},
"dataset": {
"name": benchmark_dataset_id,
"type": benchmark_dataset_id,
"config": config_name,
"split": "test",
},
"metrics": [
{
"type": "success_rate",
"value": _safe(pc_success),
"name": "Success Rate (%)",
},
{
"type": "n_episodes",
"value": n_episodes,
"name": "Number of Episodes",
},
],
"evaluated_at": eval_date,
}
if source_url:
row["source_url"] = source_url
if notes:
row["notes"] = notes
return row
per_group: dict = info.get("per_group", {})
if per_group:
for group_name, group_metrics in per_group.items():
rows.append(
_make_row(
config_name=group_name,
pc_success=group_metrics.get("pc_success", float("nan")),
n_episodes=group_metrics.get("n_episodes", 0),
)
)
else:
overall = info.get("overall", {})
rows.append(
_make_row(
config_name=env_type,
pc_success=overall.get("pc_success", float("nan")),
n_episodes=overall.get("n_episodes", 0),
)
)
return rows
def upload_eval_results_yaml(
*,
api: Any,
repo_id: str,
rows: list[dict[str, Any]],
env_type: str,
env_task: str | None,
output_dir: Path,
) -> str:
"""Serialise ``rows`` to YAML and upload to the Hub model repo.
The file is written locally to ``output_dir/eval_results.yaml`` and
then uploaded to ``eval/{env_type}/eval_results.yaml`` in ``repo_id``.
Args:
api: An instantiated ``huggingface_hub.HfApi`` object.
repo_id: HF model repo (e.g. ``"user/my_policy"``).
rows: Rows produced by ``build_eval_results_rows``.
env_type: Environment type string (used for the Hub path prefix).
env_task: The env task string (unused, kept for API symmetry).
output_dir: Local directory to write the YAML before uploading.
Returns:
URL of the Hub commit containing the uploaded file.
"""
yaml_path = Path(output_dir) / "eval_results.yaml"
yaml_path.write_text(yaml.dump({"eval_results": rows}, sort_keys=False, allow_unicode=True))
commit_url = api.upload_file(
path_or_fileobj=str(yaml_path),
path_in_repo=f"eval/{env_type}/eval_results.yaml",
repo_id=repo_id,
commit_message=f"Upload eval results YAML for {env_type}",
)
return str(commit_url)