feat(ci): add health dashboard Space + benchmark metrics artifacts

- spaces/health-dashboard/app.py: Gradio Space that queries the GitHub
  Actions API directly (no extra datastore). Shows benchmark status
  badges, success-rate and duration trend charts, and embeds the latest
  rollout video per benchmark. Results cached 5 min in-memory; video
  files cached on disk by artifact ID so downloads only happen once.
- spaces/health-dashboard/requirements.txt + README.md: Space card with
  setup instructions for the GITHUB_RO_TOKEN secret (actions:read,
  metadata:read only).
- scripts/ci/parse_eval_metrics.py: runs on the CI host after each eval,
  reads eval_info.json written by lerobot-eval, extracts pc_success and
  n_episodes, and writes metrics.json to the artifacts dir.
- .github/workflows/benchmark_tests.yml: add "Parse … metrics" and
  "Upload … metrics" steps (if: always()) after each eval so the
  dashboard has data even when the eval fails.

The Space should be deployed as a private Space under the huggingface
org. Required secret: GITHUB_RO_TOKEN (fine-grained, read-only).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-04-08 17:46:44 +02:00
parent 13ee7009fe
commit 452d9abaa4
5 changed files with 666 additions and 0 deletions
+34
View File
@@ -125,6 +125,15 @@ jobs:
--output_dir=/artifacts --output_dir=/artifacts
" "
- name: Parse Libero eval metrics
if: always()
run: |
python scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/libero-artifacts \
--env libero \
--task libero_spatial \
--policy pepijn223/smolvla_libero
- name: Upload Libero rollout video - name: Upload Libero rollout video
if: always() if: always()
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
@@ -133,6 +142,14 @@ jobs:
path: /tmp/libero-artifacts/videos/ path: /tmp/libero-artifacts/videos/
if-no-files-found: warn if-no-files-found: warn
- name: Upload Libero eval metrics
if: always()
uses: actions/upload-artifact@v4
with:
name: libero-metrics
path: /tmp/libero-artifacts/metrics.json
if-no-files-found: warn
# ── METAWORLD ───────────────────────────────────────────────────────────── # ── METAWORLD ─────────────────────────────────────────────────────────────
# Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain) # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain)
metaworld-integration-test: metaworld-integration-test:
@@ -189,6 +206,15 @@ jobs:
--output_dir=/artifacts --output_dir=/artifacts
" "
- name: Parse MetaWorld eval metrics
if: always()
run: |
python scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/metaworld-artifacts \
--env metaworld \
--task metaworld-push-v3 \
--policy pepijn223/smolvla_metaworld
- name: Upload MetaWorld rollout video - name: Upload MetaWorld rollout video
if: always() if: always()
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
@@ -196,3 +222,11 @@ jobs:
name: metaworld-rollout-video name: metaworld-rollout-video
path: /tmp/metaworld-artifacts/videos/ path: /tmp/metaworld-artifacts/videos/
if-no-files-found: warn if-no-files-found: warn
- name: Upload MetaWorld eval metrics
if: always()
uses: actions/upload-artifact@v4
with:
name: metaworld-metrics
path: /tmp/metaworld-artifacts/metrics.json
if-no-files-found: warn
+113
View File
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse lerobot-eval output into a small metrics.json artifact.
Reads eval_info.json written by lerobot-eval --output_dir and extracts the
key metrics needed by the health dashboard. Handles both single-task and
multi-task eval output formats.
Usage:
python scripts/ci/parse_eval_metrics.py \\
--artifacts-dir /tmp/libero-artifacts \\
--env libero \\
--task libero_spatial \\
--policy pepijn223/smolvla_libero
Writes <artifacts-dir>/metrics.json. The CI workflow then uploads this file
as a GitHub Actions artifact named "<env>-metrics".
"""
from __future__ import annotations
import argparse
import json
import math
import sys
from pathlib import Path
def _extract_pc_success(info: dict) -> tuple[float | None, int | None]:
"""Extract (pc_success, n_episodes) from eval_info.json.
Handles two output shapes:
- Single-task: {"aggregated": {"pc_success": 80.0, ...}}
- Multi-task: {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}
"""
# Single-task path
if "aggregated" in info:
agg = info["aggregated"]
pc = agg.get("pc_success")
n = agg.get("n_episodes") # may be absent in older format
if pc is not None and not math.isnan(pc):
return float(pc), int(n) if n is not None else None
# Multi-task path
if "overall" in info:
overall = info["overall"]
pc = overall.get("pc_success")
n = overall.get("n_episodes")
if pc is not None and not math.isnan(pc):
return float(pc), int(n) if n is not None else None
return None, None
def main() -> int:
    """Parse CLI args, read eval_info.json, and write metrics.json.

    The script is best-effort by design: a missing or unreadable
    eval_info.json yields a metrics.json with null pc_success / n_episodes
    rather than a failure, so the dashboard still receives a data point.

    Returns:
        Process exit code (always 0).
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
    args = parser.parse_args()

    artifacts_dir = Path(args.artifacts_dir)
    eval_info_path = artifacts_dir / "eval_info.json"

    pc_success: float | None = None
    n_episodes: int | None = None

    if eval_info_path.exists():
        try:
            info = json.loads(eval_info_path.read_text(encoding="utf-8"))
            pc_success, n_episodes = _extract_pc_success(info)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # AttributeError covers unexpected (non-dict) JSON shapes.
            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
    else:
        print(
            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
            file=sys.stderr,
        )

    metrics = {
        "env": args.env,
        "task": args.task,
        "policy": args.policy,
        "pc_success": pc_success,
        "n_episodes": n_episodes,
    }

    # The directory may not exist if the eval step died before producing any
    # output; create it so the null-metrics file can still be written.
    artifacts_dir.mkdir(parents=True, exist_ok=True)
    out_path = artifacts_dir / "metrics.json"
    out_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    print(f"[parse_eval_metrics] Written: {out_path}")
    print(json.dumps(metrics, indent=2))
    return 0


if __name__ == "__main__":
    sys.exit(main())
+27
View File
@@ -0,0 +1,27 @@
---
title: LeRobot Health Dashboard
emoji: 🤖
colorFrom: yellow
colorTo: orange
sdk: gradio
sdk_version: 5.29.0
app_file: app.py
pinned: true
license: apache-2.0
short_description: Live CI health for the LeRobot main branch
---
# LeRobot Health Dashboard
Internal dashboard for monitoring the health of the `main` branch — benchmark smoke-test
success rates, CI job durations, and latest rollout videos, all pulled live from the
GitHub Actions API.
## Required secret
Add `GITHUB_RO_TOKEN` in the Space settings with a fine-grained GitHub token scoped to:
- **Repository**: `huggingface/lerobot`
- **Permissions**: `Actions` → Read-only, `Metadata` → Read-only
The token is never exposed in the UI or logs.
+488
View File
@@ -0,0 +1,488 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LeRobot CI Health Dashboard.
Pulls live data from the GitHub Actions API — no separate data store needed.
Benchmark smoke-test results (success rate, duration) come from a small
metrics.json artifact that each benchmark CI job uploads.
Required Space secret: GITHUB_RO_TOKEN
Fine-grained token for huggingface/lerobot with Actions=read, Metadata=read.
"""
from __future__ import annotations
import io
import json
import os
import threading
import time
import zipfile
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
import gradio as gr
import plotly.graph_objects as go
import requests # type: ignore[import-untyped]
# ── Config ────────────────────────────────────────────────────────────────────
# Repository whose GitHub Actions runs are displayed ("owner/name").
REPO = "huggingface/lerobot"
# Read from the environment; an empty string means "no token configured".
GH_TOKEN = os.environ.get("GITHUB_RO_TOKEN", "")
# On-disk cache for downloaded artifacts (metrics JSON and rollout videos).
CACHE_DIR = Path("/tmp/dashboard-cache")  # nosec B108 — only writable path in HF Spaces
# Created at import time so the download helpers can write into it directly.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Lifetime, in seconds, of entries in the in-memory GitHub API response cache.
API_CACHE_TTL = 300  # 5 min — avoids hammering GitHub on every page load
# Maps CI job name fragment → display info.
# "label" is the human-readable benchmark name shown in the UI.
# "video_artifact" is the actions/upload-artifact name for the rollout video.
# "metrics_artifact" is the artifact name for metrics.json.
BENCHMARKS: dict[str, dict[str, str]] = {
    "libero-integration-test": {
        "label": "LIBERO",
        "video_artifact": "libero-rollout-video",
        "metrics_artifact": "libero-metrics",
    },
    "metaworld-integration-test": {
        "label": "MetaWorld",
        "video_artifact": "metaworld-rollout-video",
        "metrics_artifact": "metaworld-metrics",
    },
}
# Maps workflow run name (as reported by the API) → row label in the status table.
# "Benchmark Integration Tests" is skipped when rendering workflow rows because
# its jobs are shown as individual benchmark rows instead.
WORKFLOW_LABELS: dict[str, str] = {
    "Benchmark Integration Tests": "Benchmarks",
    "Fast Tests": "Fast Tests",
    "Full Tests": "Full Tests",
    "Quality": "Quality",
    "Security": "Security",
}
# ── GitHub API helpers ────────────────────────────────────────────────────────
# Response cache: key → (decoded JSON payload, time.monotonic() at insertion).
_api_cache: dict[str, tuple[Any, float]] = {}
# Guards every read and write of _api_cache.
_api_lock = threading.Lock()


def _gh_get(path: str, **kwargs: Any) -> Any:
    """Perform a GET against the GitHub REST API, memoised for API_CACHE_TTL seconds.

    Args:
        path: API path starting with "/", appended to https://api.github.com.
        **kwargs: Forwarded to ``requests.get`` (e.g. ``params``); they are
            also part of the cache key.

    Returns:
        The decoded JSON body.

    Raises:
        requests.HTTPError: if the response status is an error.
    """
    cache_key = path + str(kwargs)

    with _api_lock:
        cached = _api_cache.get(cache_key)
        if cached is not None:
            payload, stored_at = cached
            if time.monotonic() - stored_at < API_CACHE_TTL:
                return payload

    # The lock is released during the network call so a slow request does not
    # block other callers.
    request_headers: dict[str, str] = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    if GH_TOKEN:
        request_headers["Authorization"] = f"Bearer {GH_TOKEN}"

    response = requests.get(
        f"https://api.github.com{path}", headers=request_headers, timeout=20, **kwargs
    )
    response.raise_for_status()
    payload = response.json()

    with _api_lock:
        _api_cache[cache_key] = (payload, time.monotonic())
    return payload
def _gh_download(url: str) -> bytes:
    """Fetch *url* and return the raw response body.

    The bearer token is attached when GH_TOKEN is set, and redirects are
    followed (``allow_redirects=True``). The whole body is held in memory.

    Raises:
        requests.HTTPError: if the final response status is an error.
    """
    auth_headers: dict[str, str] = {"Authorization": f"Bearer {GH_TOKEN}"} if GH_TOKEN else {}
    response = requests.get(url, headers=auth_headers, allow_redirects=True, timeout=120)
    response.raise_for_status()
    return response.content
# ── Data fetchers ─────────────────────────────────────────────────────────────
def fetch_recent_runs(branch: str, n: int = 40) -> list[dict]:
    """Return up to *n* workflow runs for *branch*, in the order the API returns them.

    An empty list is returned when the response has no "workflow_runs" key.
    """
    query = {"branch": branch, "per_page": n}
    payload = _gh_get(f"/repos/{REPO}/actions/runs", params=query)
    return payload.get("workflow_runs", [])
def fetch_jobs(run_id: int) -> list[dict]:
    """Return the jobs of workflow run *run_id* (first page, up to 100).

    The request goes through ``_gh_get``, so it is subject to that helper's
    ordinary TTL cache; no longer-lived caching is applied here.
    """
    query = {"per_page": 100}
    payload = _gh_get(f"/repos/{REPO}/actions/runs/{run_id}/jobs", params=query)
    return payload.get("jobs", [])
def fetch_artifacts(run_id: int) -> list[dict]:
    """Return the artifacts attached to workflow run *run_id* (first page, up to 100)."""
    query = {"per_page": 100}
    payload = _gh_get(f"/repos/{REPO}/actions/runs/{run_id}/artifacts", params=query)
    return payload.get("artifacts", [])
def download_metrics_json(artifact_id: int) -> dict | None:
    """Download and parse metrics.json from a zip artifact. Caches to disk.

    Args:
        artifact_id: GitHub Actions artifact ID; also used as the cache key.

    Returns:
        The parsed metrics mapping, or None if the artifact could not be
        fetched, has no metrics.json, or does not contain a JSON object.
        Failures are logged and never raised (best-effort).
    """
    cache_path = CACHE_DIR / f"metrics_{artifact_id}.json"

    if cache_path.exists():
        try:
            cached = json.loads(cache_path.read_text())
        except ValueError:
            # Corrupt cache entry (bad JSON or bad encoding): drop and re-download.
            cache_path.unlink(missing_ok=True)
        else:
            if isinstance(cached, dict):
                return cached
            # Callers use .get() on the result, so anything but a dict is unusable.
            cache_path.unlink(missing_ok=True)

    try:
        raw = _gh_download(f"https://api.github.com/repos/{REPO}/actions/artifacts/{artifact_id}/zip")
        with zipfile.ZipFile(io.BytesIO(raw)) as zf:
            if "metrics.json" in zf.namelist():
                data = json.loads(zf.read("metrics.json"))
                if not isinstance(data, dict):
                    print(f"[dashboard] metrics artifact {artifact_id} is not a JSON object; ignoring")
                    return None
                # Write to a per-thread temp file and rename into place so a
                # concurrent reader never sees a half-written cache entry.
                tmp_path = cache_path.with_name(f"{cache_path.name}.{threading.get_ident()}.tmp")
                try:
                    tmp_path.write_text(json.dumps(data))
                    tmp_path.replace(cache_path)
                finally:
                    tmp_path.unlink(missing_ok=True)
                return data
    except Exception as exc:
        # Deliberately broad: a broken artifact must not take the dashboard down.
        print(f"[dashboard] Could not fetch metrics artifact {artifact_id}: {exc}")
    return None
def download_video(artifact_id: int, label: str) -> Path | None:
    """Download the first .mp4 from a zip artifact. Caches to disk.

    Args:
        artifact_id: GitHub Actions artifact ID; also used as the cache key.
        label: Benchmark display name, used only in the failure log message.

    Returns:
        Path to the cached mp4, or None if the artifact could not be fetched
        or contains no .mp4 member. Failures are logged and never raised.
    """
    cache_path = CACHE_DIR / f"video_{artifact_id}.mp4"
    if cache_path.exists():
        return cache_path

    try:
        raw = _gh_download(f"https://api.github.com/repos/{REPO}/actions/artifacts/{artifact_id}/zip")
        with zipfile.ZipFile(io.BytesIO(raw)) as zf:
            mp4s = [n for n in zf.namelist() if n.endswith(".mp4")]
            if mp4s:
                # Write to a per-thread temp file and rename into place: the
                # exists() fast path above must never see a partial video
                # while another request is still writing it.
                tmp_path = cache_path.with_name(f"{cache_path.name}.{threading.get_ident()}.tmp")
                try:
                    tmp_path.write_bytes(zf.read(mp4s[0]))
                    tmp_path.replace(cache_path)
                finally:
                    # No-op after a successful rename; cleans up on failure.
                    tmp_path.unlink(missing_ok=True)
                return cache_path
    except Exception as exc:
        # Deliberately broad: a broken artifact must not take the dashboard down.
        print(f"[dashboard] Could not fetch video artifact {artifact_id} ({label}): {exc}")
    return None
# ── Data aggregation ──────────────────────────────────────────────────────────
def _job_duration_minutes(job: dict) -> float | None:
started = job.get("started_at")
completed = job.get("completed_at")
if not started or not completed:
return None
fmt = "%Y-%m-%dT%H:%M:%SZ"
try:
delta = datetime.strptime(completed, fmt) - datetime.strptime(started, fmt)
return delta.total_seconds() / 60
except ValueError:
return None
def aggregate(branch: str) -> dict:
    """Collect GitHub Actions data for *branch* and shape it for the UI.

    Returns a dict with:
        "bench_history": {benchmark key: [entry, ...]} in the order the API
            returned the runs; each entry holds run_id, run_url, created_at,
            conclusion, duration_min, pc_success, n_episodes and
            video_artifact_id.
        "workflow_latest": {workflow name: {conclusion, created_at, run_url}}
            taken from the first run seen for each workflow.
        "fetched_at": ISO-8601 UTC timestamp of this aggregation.
    """
    bench_history: dict[str, list[dict]] = {name: [] for name in BENCHMARKS}
    workflow_latest: dict[str, dict] = {}

    for run in fetch_recent_runs(branch, n=40):
        wf_name = run["name"]
        conclusion = run["conclusion"]  # "success" | "failure" | "cancelled" | None
        created_at = run["created_at"]
        run_id = run["id"]
        run_url = run["html_url"]

        # Only the first run seen per workflow is recorded as its latest.
        workflow_latest.setdefault(
            wf_name,
            {
                "conclusion": conclusion,
                "created_at": created_at,
                "run_url": run_url,
            },
        )

        # Per-job details are only needed for the benchmark workflow.
        if wf_name != "Benchmark Integration Tests":
            continue

        jobs = fetch_jobs(run_id)
        live_artifacts: dict[str, dict] = {}
        for artifact in fetch_artifacts(run_id):
            if artifact.get("expired"):
                continue
            live_artifacts[artifact["name"]] = artifact

        for job in jobs:
            # A job belongs to a benchmark when the benchmark key is a substring
            # of the lower-cased job name with spaces turned into hyphens.
            # NOTE(review): a display name such as
            # "Libero — build image + 1-episode eval" would not contain
            # "libero-integration-test" — confirm the job names the API reports.
            slug = job["name"].lower().replace(" ", "-")
            bench_key = next((name for name in BENCHMARKS if name in slug), None)
            if bench_key is None:
                continue

            spec = BENCHMARKS[bench_key]
            metrics_artifact = live_artifacts.get(spec["metrics_artifact"])
            metrics: dict | None = None
            if metrics_artifact is not None:
                metrics = download_metrics_json(metrics_artifact["id"])

            entry = {
                "run_id": run_id,
                "run_url": run_url,
                "created_at": created_at,
                "conclusion": job["conclusion"],
                "duration_min": _job_duration_minutes(job),
                "pc_success": metrics.get("pc_success") if metrics else None,
                "n_episodes": metrics.get("n_episodes") if metrics else None,
                "video_artifact_id": live_artifacts.get(spec["video_artifact"], {}).get("id"),
            }
            bench_history[bench_key].append(entry)

    return {
        "bench_history": bench_history,
        "workflow_latest": workflow_latest,
        "fetched_at": datetime.now(UTC).isoformat(),
    }
# ── UI helpers ────────────────────────────────────────────────────────────────
_STATUS_STYLE = {
"success": ("✓ passing", "#16a34a"),
"failure": ("✗ failing", "#dc2626"),
"cancelled": ("⚠ cancelled", "#d97706"),
None: ("◌ pending", "#6b7280"),
}
def _badge(conclusion: str | None) -> str:
label, color = _STATUS_STYLE.get(conclusion, ("? unknown", "#6b7280"))
return (
f'<span style="background:{color};color:#fff;padding:1px 9px;border-radius:12px;'
f'font-size:12px;font-weight:600;font-family:monospace">{label}</span>'
)
def _fmt_date(iso: str | None) -> str:
if not iso:
return ""
return iso[:10]
def render_status_table(data: dict) -> str:
    """Render the status overview as an HTML ``<table>`` string.

    One row per entry in BENCHMARKS (using the first history entry, if any),
    followed by one row per entry in WORKFLOW_LABELS except
    "Benchmark Integration Tests". Columns: Job, Status, Last run,
    Success rate, Episodes, Link.

    Args:
        data: Mapping with "bench_history" and "workflow_latest" keys.
    """
    bench_history = data["bench_history"]
    workflow_latest = data["workflow_latest"]

    def _tr(cells: list) -> str:
        # Wrap each cell in <td> and the whole row in <tr>.
        return "<tr>" + "".join(f"<td>{cell}</td>" for cell in cells) + "</tr>"

    body_rows: list[str] = []

    # ── Benchmark rows ──────────────────────────────────────────────
    for bench_key, spec in BENCHMARKS.items():
        runs = bench_history.get(bench_key, [])
        if runs:
            newest = runs[0]
            status = _badge(newest["conclusion"])
            day = _fmt_date(newest["created_at"])
            rate = newest.get("pc_success")
            rate_text = "" if rate is None else f"{rate:.1f}%"
            episodes = newest.get("n_episodes") or ""
            anchor = f'<a href="{newest["run_url"]}" target="_blank">#{newest["run_id"]}</a>'
        else:
            # No history yet: pending badge, all other cells blank.
            status = _badge(None)
            day = rate_text = episodes = anchor = ""
        body_rows.append(_tr([f"<b>{spec['label']}</b>", status, day, rate_text, episodes, anchor]))

    # ── Other workflow rows ─────────────────────────────────────────
    for wf_name, display in WORKFLOW_LABELS.items():
        if wf_name == "Benchmark Integration Tests":
            continue  # covered by the benchmark rows
        newest_run = workflow_latest.get(wf_name)
        if newest_run:
            status = _badge(newest_run["conclusion"])
            day = _fmt_date(newest_run["created_at"])
            anchor = f'<a href="{newest_run["run_url"]}" target="_blank">run</a>'
        else:
            status = _badge(None)
            day = anchor = ""
        body_rows.append(_tr([f"<b>{display}</b>", status, day, "—", "—", anchor]))

    column_titles = ("Job", "Status", "Last run", "Success rate", "Episodes", "Link")
    header = (
        "<tr style='border-bottom:1px solid #e5e7eb'>"
        + "".join(f"<th align='left' style='padding:6px 12px'>{title}</th>" for title in column_titles)
        + "</tr>"
    )

    return (
        "<table style='width:100%;border-collapse:collapse;font-family:sans-serif;font-size:14px'>"
        + header
        + "\n".join(body_rows)
        + "</table>"
    )
def render_success_rate_chart(data: dict) -> go.Figure:
    """Build a line chart of pc_success per benchmark.

    Entries without a pc_success value are skipped; a benchmark with no
    usable entries gets no trace. The x value is the first 10 characters of
    each entry's "created_at".
    """
    fig = go.Figure()

    for bench_key, spec in BENCHMARKS.items():
        points = [
            entry
            for entry in data["bench_history"].get(bench_key, [])
            if entry.get("pc_success") is not None
        ]
        if not points:
            continue
        days = [point["created_at"][:10] for point in points]
        rates = [point["pc_success"] for point in points]
        trace = go.Scatter(
            x=days,
            y=rates,
            mode="lines+markers",
            name=spec["label"],
            line={"width": 2},
            marker={"size": 6},
        )
        fig.add_trace(trace)

    fig.update_layout(
        title="Benchmark Success Rate (%) over time",
        yaxis={"title": "Success rate (%)", "range": [0, 105]},
        xaxis={"title": ""},
        height=320,
        margin={"l": 50, "r": 20, "t": 40, "b": 40},
        legend={"orientation": "h", "y": -0.15},
    )
    return fig
def render_duration_chart(data: dict) -> go.Figure:
    """Build a grouped bar chart of CI job duration (minutes) per benchmark.

    Entries without a duration are skipped; durations are rounded to one
    decimal. The x value is the first 10 characters of each entry's
    "created_at".
    """
    fig = go.Figure()

    for bench_key, spec in BENCHMARKS.items():
        timed = [
            entry
            for entry in data["bench_history"].get(bench_key, [])
            if entry.get("duration_min") is not None
        ]
        if not timed:
            continue
        days = [entry["created_at"][:10] for entry in timed]
        minutes = [round(entry["duration_min"], 1) for entry in timed]
        fig.add_trace(go.Bar(x=days, y=minutes, name=spec["label"], opacity=0.85))

    fig.update_layout(
        title="Benchmark CI Duration (minutes)",
        yaxis={"title": "Duration (min)"},
        xaxis={"title": ""},
        barmode="group",
        height=320,
        margin={"l": 50, "r": 20, "t": 40, "b": 40},
        legend={"orientation": "h", "y": -0.15},
    )
    return fig
def fetch_latest_videos(data: dict) -> dict[str, str | None]:
    """Return {bench_key: local_mp4_path_or_None} for each benchmark.

    History entries are walked in order and the first one whose video
    artifact downloads successfully wins. The entry's conclusion is not
    checked, so the video may come from a failed run.
    """
    latest: dict[str, str | None] = {}
    for bench_key, spec in BENCHMARKS.items():
        latest[bench_key] = None
        for entry in data["bench_history"].get(bench_key, []):
            artifact_id = entry.get("video_artifact_id")
            if not artifact_id:
                continue
            local_file = download_video(artifact_id, spec["label"])
            if local_file:
                latest[bench_key] = str(local_file)
                break
    return latest
# ── Gradio app ────────────────────────────────────────────────────────────────
def refresh(branch: str) -> tuple:
    """Gradio callback: rebuild every dashboard output for *branch*.

    Returns:
        A 6-tuple matching the UI outputs: (status table HTML, success-rate
        figure, duration figure, video path for the first benchmark, video
        path for the second benchmark, status/"last updated" text).

    On a missing token or a failed GitHub request the tuple carries an error
    message, empty figures and no videos instead of raising.
    """
    if not GH_TOKEN:
        err = "<p style='color:red'><b>GITHUB_RO_TOKEN secret not set.</b> Add it in Space settings.</p>"
        return err, go.Figure(), go.Figure(), None, None, "Error: no token"

    try:
        data = aggregate(branch)
    except requests.RequestException as exc:
        # RequestException is the base class of HTTPError and also covers
        # timeouts, connection failures and undecodable response bodies, so
        # every network-level failure reaches the error UI.
        err = f"<p style='color:red'>GitHub API error: {exc}</p>"
        return err, go.Figure(), go.Figure(), None, None, str(exc)

    status_html = render_status_table(data)
    sr_chart = render_success_rate_chart(data)
    dur_chart = render_duration_chart(data)
    videos = fetch_latest_videos(data)
    updated = datetime.now(UTC).strftime("Last updated: %Y-%m-%d %H:%M UTC")

    # The UI has exactly two video slots; fill them in BENCHMARKS order.
    bench_keys = list(BENCHMARKS.keys())
    video_0 = videos.get(bench_keys[0]) if len(bench_keys) > 0 else None
    video_1 = videos.get(bench_keys[1]) if len(bench_keys) > 1 else None

    return status_html, sr_chart, dur_chart, video_0, video_1, updated
# UI layout. Components are laid out in creation order, so statement order
# below is significant.
with gr.Blocks(title="LeRobot Health Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# 🤖 LeRobot — CI Health Dashboard\n"
        "Live view of benchmark smoke tests, CI job health, and latest rollout videos. "
        "Data pulled from the GitHub Actions API."
    )
    # Controls row: branch selector, manual refresh button, status text.
    with gr.Row():
        # The selectable branches are hard-coded here.
        branch_dd = gr.Dropdown(
            choices=["main", "feat/benchmark-ci"],
            value="main",
            label="Branch",
            scale=1,
        )
        refresh_btn = gr.Button("Refresh", variant="primary", scale=0)
        updated_md = gr.Markdown("Click Refresh or wait for auto-load.", scale=3)
    gr.Markdown("## Status")
    status_html = gr.HTML()
    # Trend charts, side by side.
    with gr.Row():
        sr_plot = gr.Plot(label="Success Rate Trend")
        dur_plot = gr.Plot(label="Duration Trend")
    gr.Markdown("## Latest Rollout Videos")
    # Exactly two video slots, labelled from BENCHMARKS in order; a generic
    # label is used for a slot that has no matching benchmark.
    bench_labels = [v["label"] for v in BENCHMARKS.values()]
    with gr.Row():
        video_0 = gr.Video(
            label=bench_labels[0] if len(bench_labels) > 0 else "Benchmark 0", interactive=False
        )
        video_1 = gr.Video(
            label=bench_labels[1] if len(bench_labels) > 1 else "Benchmark 1", interactive=False
        )
    # Order must match the 6-tuple returned by refresh().
    outputs = [status_html, sr_plot, dur_plot, video_0, video_1, updated_md]
    # refresh() is wired both to the button and to the app's load event.
    refresh_btn.click(fn=refresh, inputs=[branch_dd], outputs=outputs)
    demo.load(fn=refresh, inputs=[branch_dd], outputs=outputs)
if __name__ == "__main__":
    demo.launch()
+4
View File
@@ -0,0 +1,4 @@
gradio>=5.0.0,<6.0.0
plotly>=5.18.0
pandas>=2.0.0
requests>=2.31.0