mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-11 22:59:50 +00:00
Compare commits
45 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a23ebf9d35 | |||
| bfff81fd4b | |||
| 929400cd44 | |||
| fe78f8fee9 | |||
| ce9bfa754d | |||
| b86935c64b | |||
| a2f72e42f6 | |||
| a515eadc96 | |||
| 8d982614a6 | |||
| c8df80ae91 | |||
| 1ac8e96575 | |||
| a6dd28e8b4 | |||
| 1842100402 | |||
| 00e9defb80 | |||
| b81eef43c8 | |||
| d483dd4c4b | |||
| a56423fa33 | |||
| da7da741f1 | |||
| b1e16783de | |||
| a4544ffea7 | |||
| dbe01b0444 | |||
| e16a95a78e | |||
| 4137b5785d | |||
| 8ece10e484 | |||
| ddeb216ab9 | |||
| d46d67f75d | |||
| b746cd3c61 | |||
| 6d1a5fca02 | |||
| 8d7099cd7d | |||
| 516f39685a | |||
| b27e838376 | |||
| 40470648d1 | |||
| 25e5062b2c | |||
| 35e3b28da1 | |||
| ed8a98dda6 | |||
| 9dc38d9993 | |||
| 3922f81791 | |||
| 28e8483297 | |||
| e1b22ed1c4 | |||
| f2d0f04dd0 | |||
| 3ea722c6c0 | |||
| 48660e7a7c | |||
| c94fe868c9 | |||
| d4f27cfb6e | |||
| 1a2aec1b04 |
@@ -0,0 +1,237 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Model Profiling
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 0 * * 0"
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- .github/workflows/model_profiling.yml
|
||||
- src/lerobot/configs/train.py
|
||||
- src/lerobot/scripts/lerobot_train.py
|
||||
- src/lerobot/utils/model_profiling.py
|
||||
- tests/test_model_profiling.py
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
git_ref:
|
||||
description: Git ref to profile when no commit SHA is provided
|
||||
required: false
|
||||
type: string
|
||||
default: main
|
||||
git_commit:
|
||||
description: Optional exact commit SHA to profile
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
policies:
|
||||
description: Optional comma-separated policy filter
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
profile_mode:
|
||||
description: Torch profiler mode
|
||||
required: false
|
||||
type: choice
|
||||
options:
|
||||
- trace
|
||||
- summary
|
||||
default: trace
|
||||
publish_results:
|
||||
description: Publish results to the profiling dataset when a Hub token is available
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
results_repo:
|
||||
description: Dataset repo name or fully qualified repo id
|
||||
required: false
|
||||
type: string
|
||||
default: model-profiling-history
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.inputs.git_commit || github.event.inputs.git_ref || github.ref_name || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
profile-models:
|
||||
name: Weekly Model Profiling
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
PROFILE_MODE: ${{ github.event_name == 'pull_request' && 'summary' || github.event.inputs.profile_mode || 'trace' }}
|
||||
POLICY_FILTER: ${{ github.event_name == 'pull_request' && 'act,diffusion,pi0,pi05,smolvla,groot,xvla,wall_x' || github.event.inputs.policies || '' }}
|
||||
RESULTS_REPO: ${{ github.event.inputs.results_repo || 'model-profiling-history' }}
|
||||
SHOULD_PUBLISH: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_results == 'true') }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
ref: ${{ github.event.pull_request.head.sha || github.event.inputs.git_commit || github.event.inputs.git_ref || 'main' }}
|
||||
|
||||
- name: Pull GPU image
|
||||
run: docker pull huggingface/lerobot-gpu:latest
|
||||
|
||||
- name: Run model profiling
|
||||
env:
|
||||
HOST_GIT_COMMIT: ${{ github.event.pull_request.head.sha || github.event.inputs.git_commit || github.sha }}
|
||||
PROFILE_GIT_REF: ${{ github.head_ref || github.ref_name || github.event.inputs.git_ref || 'main' }}
|
||||
PROFILE_PR_NUMBER: ${{ github.event.pull_request.number || '' }}
|
||||
run: |
|
||||
set -eux
|
||||
mkdir -p profiling-results
|
||||
docker run --rm --gpus all \
|
||||
--user "$(id -u):$(id -g)" \
|
||||
--shm-size=16g \
|
||||
-e HOME=/tmp/lerobot-home \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_LEROBOT_HOME=/tmp/hf-lerobot \
|
||||
-e TORCH_HOME=/tmp/torch-home \
|
||||
-e TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor-cache \
|
||||
-e UV_PROJECT_ENVIRONMENT=/tmp/lerobot-venv \
|
||||
-e UV_CACHE_DIR=/tmp/uv-cache \
|
||||
-e UV_PYTHON_PREFERENCE=only-system \
|
||||
-e XDG_DATA_HOME=/tmp/xdg-data \
|
||||
-e XDG_CACHE_HOME=/tmp/xdg-cache \
|
||||
-e HOST_GIT_COMMIT="${HOST_GIT_COMMIT}" \
|
||||
-e PROFILE_GIT_REF="${PROFILE_GIT_REF}" \
|
||||
-e PROFILE_PR_NUMBER="${PROFILE_PR_NUMBER}" \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e PROFILE_MODE="${PROFILE_MODE}" \
|
||||
-e POLICY_FILTER="${POLICY_FILTER}" \
|
||||
-e RESULTS_REPO="${RESULTS_REPO}" \
|
||||
-e SHOULD_PUBLISH="${SHOULD_PUBLISH}" \
|
||||
-v "${GITHUB_WORKSPACE}:/workspace" \
|
||||
-w /workspace \
|
||||
huggingface/lerobot-gpu:latest \
|
||||
bash -c '
|
||||
set -euxo pipefail
|
||||
mkdir -p "${HOME}" "${HF_HOME}" "${HF_LEROBOT_HOME}" "${TORCH_HOME}" "${UV_CACHE_DIR}" "${XDG_CACHE_HOME}" "${XDG_DATA_HOME}" "${TORCHINDUCTOR_CACHE_DIR}"
|
||||
rm -rf /tmp/lerobot-src
|
||||
cp -a /workspace/. /tmp/lerobot-src
|
||||
cd /tmp/lerobot-src
|
||||
|
||||
if [[ -n "${HF_USER_TOKEN:-}" ]]; then
|
||||
hf auth login --token "${HF_USER_TOKEN}" --add-to-git-credential 2>/dev/null || true
|
||||
fi
|
||||
|
||||
policies_to_run=()
|
||||
if [[ -n "${POLICY_FILTER}" ]]; then
|
||||
IFS="," read -ra policies_to_run <<< "${POLICY_FILTER}"
|
||||
else
|
||||
policies_to_run=(act diffusion groot multi_task_dit pi0 pi0_fast pi05 smolvla wall_x xvla)
|
||||
fi
|
||||
|
||||
policy_extras() {
|
||||
case "$1" in
|
||||
act) ;;
|
||||
diffusion) echo "diffusion" ;;
|
||||
groot) echo "groot" ;;
|
||||
multi_task_dit) echo "multi_task_dit" ;;
|
||||
pi0|pi0_fast|pi05) echo "pi" ;;
|
||||
smolvla) echo "smolvla" ;;
|
||||
wall_x) echo "wallx" ;;
|
||||
xvla) echo "xvla" ;;
|
||||
*)
|
||||
echo "Unknown profiling policy $1" >&2
|
||||
return 1
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
# Policies whose dep-install may fail due to environment constraints
|
||||
# (e.g. groot requires compiling flash-attn, which needs nvcc; the CI
|
||||
# image only ships the CUDA runtime). Install failures for these are
|
||||
# logged as warnings and do not fail the job. See the TODO next to
|
||||
# `lerobot[groot]` in pyproject.toml.
|
||||
is_install_failure_tolerated() {
|
||||
case "$1" in
|
||||
groot) return 0 ;;
|
||||
*) return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
overall_status=0
|
||||
for raw_policy in "${policies_to_run[@]}"; do
|
||||
policy="$(echo "${raw_policy}" | xargs)"
|
||||
[[ -z "${policy}" ]] && continue
|
||||
|
||||
echo "::group::Profile ${policy}"
|
||||
|
||||
extra="$(policy_extras "${policy}")" || { overall_status=1; echo "::endgroup::"; continue; }
|
||||
|
||||
# Fresh, isolated dependency resolution per policy so that
|
||||
# incompatible extras (e.g. flash-attn for groot) never block
|
||||
# the rest of the matrix.
|
||||
sync_cmd=(uv sync --locked --extra training --extra test)
|
||||
if [[ -n "${extra}" ]]; then
|
||||
sync_cmd+=(--extra "${extra}")
|
||||
fi
|
||||
# flash-attn does not declare torch as a build-time dep, so its
|
||||
# isolated build env fails with ModuleNotFoundError. Torch is a
|
||||
# core lerobot dep and is already resolved here, so we disable
|
||||
# build isolation for flash-attn specifically.
|
||||
sync_cmd+=(--no-build-isolation-package flash-attn)
|
||||
if ! "${sync_cmd[@]}"; then
|
||||
if is_install_failure_tolerated "${policy}"; then
|
||||
echo "::warning::Dependency install failed for ${policy} (known-fragile); skipping."
|
||||
else
|
||||
echo "Dependency install failed for ${policy}; skipping." >&2
|
||||
overall_status=1
|
||||
fi
|
||||
echo "::endgroup::"
|
||||
continue
|
||||
fi
|
||||
|
||||
cmd=(
|
||||
uv run python -m lerobot.utils.model_profiling
|
||||
--output_dir=/workspace/profiling-results
|
||||
--hub_org=lerobot
|
||||
--results_repo="${RESULTS_REPO}"
|
||||
--profile_mode="${PROFILE_MODE}"
|
||||
--git_commit="${HOST_GIT_COMMIT}"
|
||||
--git_ref="${PROFILE_GIT_REF}"
|
||||
--pr_number="${PROFILE_PR_NUMBER}"
|
||||
--policies "${policy}"
|
||||
)
|
||||
if [[ "${SHOULD_PUBLISH}" == "true" && -n "${HF_USER_TOKEN:-}" ]]; then
|
||||
cmd+=(--publish)
|
||||
fi
|
||||
|
||||
if ! "${cmd[@]}"; then
|
||||
echo "Profiling failed for ${policy}." >&2
|
||||
overall_status=1
|
||||
fi
|
||||
|
||||
echo "::endgroup::"
|
||||
done
|
||||
|
||||
exit "${overall_status}"
|
||||
'
|
||||
|
||||
- name: Upload profiling artifacts
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: model-profiling-results
|
||||
path: profiling-results
|
||||
if-no-files-found: warn
|
||||
@@ -16,7 +16,7 @@ import datetime as dt
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, Literal
|
||||
|
||||
import draccus
|
||||
from huggingface_hub import hf_hub_download
|
||||
@@ -58,6 +58,8 @@ class TrainPipelineConfig(HubMixin):
|
||||
batch_size: int = 8
|
||||
prefetch_factor: int = 4
|
||||
persistent_workers: bool = True
|
||||
profile_mode: Literal["off", "summary", "trace"] = "off"
|
||||
profile_output_dir: Path | None = None
|
||||
steps: int = 100_000
|
||||
eval_freq: int = 20_000
|
||||
log_freq: int = 200
|
||||
@@ -130,9 +132,15 @@ class TrainPipelineConfig(HubMixin):
|
||||
now = dt.datetime.now()
|
||||
train_dir = f"{now:%Y-%m-%d}/{now:%H-%M-%S}_{self.job_name}"
|
||||
self.output_dir = Path("outputs/train") / train_dir
|
||||
if self.profile_mode != "off" and self.profile_output_dir is None:
|
||||
self.profile_output_dir = self.output_dir / "profiling"
|
||||
|
||||
if isinstance(self.dataset.repo_id, list):
|
||||
raise NotImplementedError("LeRobotMultiDataset is not currently implemented.")
|
||||
if self.profile_mode not in {"off", "summary", "trace"}:
|
||||
raise ValueError(
|
||||
f"`profile_mode` must be one of 'off', 'summary', or 'trace', got {self.profile_mode}."
|
||||
)
|
||||
|
||||
if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
|
||||
raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
|
||||
|
||||
@@ -655,7 +655,6 @@ class VLAFlowMatching(nn.Module):
|
||||
pad_masks.append(image_start_mask)
|
||||
|
||||
img_emb = self.vlm_with_expert.embed_image(img)
|
||||
img_emb = img_emb
|
||||
|
||||
# Normalize image embeddings
|
||||
img_emb_dim = img_emb.shape[-1]
|
||||
|
||||
@@ -49,6 +49,7 @@ from lerobot.optim.factory import make_optimizer_and_scheduler
|
||||
from lerobot.policies import PreTrainedPolicy, make_policy, make_pre_post_processors
|
||||
from lerobot.utils.import_utils import register_third_party_plugins
|
||||
from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
|
||||
from lerobot.utils.model_profiling import TrainingProfiler
|
||||
from lerobot.utils.random_utils import set_seed
|
||||
from lerobot.utils.utils import (
|
||||
cycle,
|
||||
@@ -71,6 +72,7 @@ def update_policy(
|
||||
lr_scheduler=None,
|
||||
lock=None,
|
||||
rabc_weights_provider=None,
|
||||
profiler: "TrainingProfiler | None" = None,
|
||||
) -> tuple[MetricsTracker, dict]:
|
||||
"""
|
||||
Performs a single training step to update the policy's weights.
|
||||
@@ -103,8 +105,10 @@ def update_policy(
|
||||
if rabc_weights_provider is not None:
|
||||
rabc_batch_weights, rabc_batch_stats = rabc_weights_provider.compute_batch_weights(batch)
|
||||
|
||||
# Let accelerator handle mixed precision
|
||||
with accelerator.autocast():
|
||||
def _section(name: str) -> Any:
|
||||
return profiler.section(name) if profiler is not None else nullcontext()
|
||||
|
||||
with _section("forward"), accelerator.autocast():
|
||||
# Use per-sample loss when RA-BC is enabled for proper weighting
|
||||
if rabc_batch_weights is not None:
|
||||
# Get per-sample losses
|
||||
@@ -123,8 +127,8 @@ def update_policy(
|
||||
|
||||
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
|
||||
|
||||
# Use accelerator's backward method
|
||||
accelerator.backward(loss)
|
||||
with _section("backward"):
|
||||
accelerator.backward(loss)
|
||||
|
||||
# Clip gradients if specified
|
||||
if grad_clip_norm > 0:
|
||||
@@ -134,8 +138,7 @@ def update_policy(
|
||||
policy.parameters(), float("inf"), error_if_nonfinite=False
|
||||
)
|
||||
|
||||
# Optimizer step
|
||||
with lock if lock is not None else nullcontext():
|
||||
with _section("optimizer"), lock if lock is not None else nullcontext():
|
||||
optimizer.step()
|
||||
|
||||
optimizer.zero_grad()
|
||||
@@ -316,6 +319,15 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
logging.info("Creating optimizer and scheduler")
|
||||
optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
|
||||
|
||||
profiler = (
|
||||
TrainingProfiler.from_cfg(cfg, device) if cfg.profile_mode != "off" and is_main_process else None
|
||||
)
|
||||
if profiler:
|
||||
profiler.record_deterministic_forward(
|
||||
policy=policy, dataset=dataset, batch_size=cfg.batch_size, preprocessor=preprocessor
|
||||
)
|
||||
profiler.start()
|
||||
|
||||
# Load precomputed SARM progress for RA-BC if enabled
|
||||
# Generate progress using: src/lerobot/policies/sarm/compute_rabc_weights.py
|
||||
rabc_weights = None
|
||||
@@ -449,6 +461,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
accelerator=accelerator,
|
||||
lr_scheduler=lr_scheduler,
|
||||
rabc_weights_provider=rabc_weights,
|
||||
profiler=profiler,
|
||||
)
|
||||
|
||||
# Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
|
||||
@@ -456,6 +469,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
step += 1
|
||||
if is_main_process:
|
||||
progbar.update(1)
|
||||
if profiler:
|
||||
profiler.step(step, train_tracker)
|
||||
train_tracker.step()
|
||||
is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 and is_main_process
|
||||
is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
|
||||
@@ -551,6 +566,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
|
||||
if is_main_process:
|
||||
progbar.close()
|
||||
if profiler:
|
||||
profiler.finalize()
|
||||
|
||||
if eval_env:
|
||||
close_envs(eval_env)
|
||||
|
||||
@@ -0,0 +1,783 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
"""Model profiling — single-file entry point.
|
||||
|
||||
Contains three things that used to live in three separate files:
|
||||
|
||||
* `TrainingProfiler` — hooks the training loop. Captures per-step
|
||||
forward/backward/optimizer timings, the torch profiler output, and a
|
||||
deterministic-forward fingerprint for regression detection.
|
||||
* `POLICY_SPECS` — CI matrix of `policy_name → (steps, train_args)`.
|
||||
Inline so there is no separate JSON to keep in sync.
|
||||
* `main()` — CI orchestrator. For each selected policy, spawns a
|
||||
`lerobot-train` subprocess with profiling enabled, collects the
|
||||
artifacts, and (optionally) publishes a row to a HF Hub dataset.
|
||||
|
||||
Usage (CI):
|
||||
|
||||
python -m lerobot.utils.model_profiling \
|
||||
--output_dir=./profiling-results \
|
||||
--policies act diffusion \
|
||||
--profile_mode=trace \
|
||||
--publish
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import statistics
|
||||
import subprocess
|
||||
import time
|
||||
from collections.abc import Iterator
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
from numbers import Real
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from huggingface_hub import CommitOperationAdd, HfApi
|
||||
from huggingface_hub.errors import HfHubHTTPError
|
||||
from torch.utils.data import default_collate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Policy matrix. Same shape as the former JSON file; inlined so the source
|
||||
# tree has one less file to keep in sync with the training args.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_LIBERO_RENAME_BASE_RGB = (
|
||||
'--rename_map={"observation.images.front": "observation.images.base_0_rgb", '
|
||||
'"observation.images.wrist": "observation.images.left_wrist_0_rgb"}'
|
||||
)
|
||||
_LIBERO_RENAME_CAMERAS = (
|
||||
'--rename_map={"observation.images.front": "observation.images.camera1", '
|
||||
'"observation.images.wrist": "observation.images.camera2"}'
|
||||
)
|
||||
_PI_SGD = [
|
||||
"--use_policy_training_preset=false",
|
||||
"--optimizer.type=sgd",
|
||||
"--optimizer.lr=1e-5",
|
||||
"--optimizer.weight_decay=0",
|
||||
"--optimizer.grad_clip_norm=1.0",
|
||||
"--scheduler.type=cosine_decay_with_warmup",
|
||||
"--scheduler.peak_lr=1e-5",
|
||||
"--scheduler.decay_lr=1e-6",
|
||||
"--scheduler.num_warmup_steps=0",
|
||||
"--scheduler.num_decay_steps=12",
|
||||
]
|
||||
|
||||
|
||||
POLICY_SPECS: dict[str, dict[str, Any]] = {
|
||||
"act": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/pusht",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.type=act",
|
||||
"--policy.device=cuda",
|
||||
"--batch_size=4",
|
||||
"--cudnn_deterministic=true",
|
||||
],
|
||||
},
|
||||
"diffusion": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/pusht",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.type=diffusion",
|
||||
"--policy.device=cuda",
|
||||
"--batch_size=4",
|
||||
"--cudnn_deterministic=true",
|
||||
],
|
||||
},
|
||||
"groot": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/libero_plus",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.type=groot",
|
||||
"--policy.base_model_path=nvidia/GR00T-N1.5-3B",
|
||||
"--policy.tune_diffusion_model=true",
|
||||
"--policy.tune_projector=true",
|
||||
"--policy.tune_llm=false",
|
||||
"--policy.tune_visual=false",
|
||||
"--policy.use_bf16=true",
|
||||
"--policy.device=cuda",
|
||||
"--batch_size=1",
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", '
|
||||
'"observation.images.image2": "observation.images.camera2"}',
|
||||
],
|
||||
},
|
||||
"multi_task_dit": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/pusht",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.type=multi_task_dit",
|
||||
"--policy.device=cuda",
|
||||
"--policy.horizon=32",
|
||||
"--policy.n_action_steps=30",
|
||||
"--batch_size=4",
|
||||
"--cudnn_deterministic=true",
|
||||
],
|
||||
},
|
||||
"pi0": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/libero_plus",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.path=lerobot/pi0_base",
|
||||
"--policy.device=cuda",
|
||||
"--policy.dtype=bfloat16",
|
||||
"--policy.n_action_steps=30",
|
||||
"--policy.use_amp=true",
|
||||
"--policy.gradient_checkpointing=true",
|
||||
"--batch_size=1",
|
||||
*_PI_SGD,
|
||||
_LIBERO_RENAME_BASE_RGB,
|
||||
],
|
||||
},
|
||||
"pi0_fast": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/libero_plus",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.path=lerobot/pi0fast-base",
|
||||
"--policy.device=cuda",
|
||||
"--policy.dtype=bfloat16",
|
||||
"--policy.n_action_steps=30",
|
||||
"--policy.use_amp=true",
|
||||
"--policy.gradient_checkpointing=true",
|
||||
"--batch_size=1",
|
||||
*_PI_SGD,
|
||||
_LIBERO_RENAME_BASE_RGB,
|
||||
],
|
||||
},
|
||||
"pi05": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/libero_plus",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.path=lerobot/pi05_base",
|
||||
"--policy.device=cuda",
|
||||
"--policy.dtype=bfloat16",
|
||||
"--policy.n_action_steps=30",
|
||||
"--policy.use_amp=true",
|
||||
"--policy.gradient_checkpointing=true",
|
||||
"--batch_size=1",
|
||||
*_PI_SGD,
|
||||
'--policy.normalization_mapping={"ACTION": "MEAN_STD", '
|
||||
'"STATE": "MEAN_STD", "VISUAL": "IDENTITY"}',
|
||||
_LIBERO_RENAME_BASE_RGB,
|
||||
],
|
||||
},
|
||||
"smolvla": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/libero_plus",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.path=lerobot/smolvla_base",
|
||||
"--policy.load_vlm_weights=true",
|
||||
"--policy.freeze_vision_encoder=false",
|
||||
"--policy.train_expert_only=false",
|
||||
"--policy.empty_cameras=1",
|
||||
"--policy.device=cuda",
|
||||
"--batch_size=1",
|
||||
_LIBERO_RENAME_CAMERAS,
|
||||
],
|
||||
},
|
||||
"wall_x": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/aloha_sim_insertion_human",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.type=wall_x",
|
||||
"--policy.pretrained_name_or_path=x-square-robot/wall-oss-flow",
|
||||
"--policy.prediction_mode=diffusion",
|
||||
"--policy.attn_implementation=eager",
|
||||
"--policy.device=cuda",
|
||||
"--batch_size=1",
|
||||
*_PI_SGD,
|
||||
],
|
||||
},
|
||||
"xvla": {
|
||||
"steps": 12,
|
||||
"train_args": [
|
||||
"--dataset.repo_id=lerobot/libero_plus",
|
||||
"--dataset.episodes=[0]",
|
||||
"--policy.path=lerobot/xvla-widowx",
|
||||
"--policy.action_mode=auto",
|
||||
"--policy.empty_cameras=1",
|
||||
"--policy.device=cuda",
|
||||
"--batch_size=1",
|
||||
'--rename_map={"observation.images.front": "observation.images.image", '
|
||||
'"observation.images.wrist": "observation.images.image2"}',
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TrainingProfiler — hooks the training loop.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _stable_float(value: float | int | None) -> float | None:
|
||||
return None if value is None else round(float(value), 8)
|
||||
|
||||
|
||||
def _as_float(value: Any) -> float:
|
||||
if isinstance(value, Real):
|
||||
return float(value)
|
||||
if hasattr(value, "val"):
|
||||
return float(value.val)
|
||||
raise TypeError(f"Expected a real-valued metric, got {type(value).__name__}")
|
||||
|
||||
|
||||
def _summary(values: list[float]) -> dict[str, float | int | None]:
|
||||
if not values:
|
||||
return {"count": 0, "mean": None, "median": None, "min": None, "max": None}
|
||||
return {
|
||||
"count": len(values),
|
||||
"mean": statistics.fmean(values),
|
||||
"median": statistics.median(values),
|
||||
"min": min(values),
|
||||
"max": max(values),
|
||||
}
|
||||
|
||||
|
||||
def _tensor_signature(tensor: torch.Tensor) -> dict[str, Any]:
|
||||
"""Small, stable summary of a tensor so forward-pass outputs can be
|
||||
compared across runs without bloating the regression JSON."""
|
||||
cpu = tensor.detach().cpu()
|
||||
hash_tensor = cpu.float() if cpu.dtype == torch.bfloat16 else cpu
|
||||
sig: dict[str, Any] = {
|
||||
"shape": list(cpu.shape),
|
||||
"dtype": str(cpu.dtype),
|
||||
"numel": cpu.numel(),
|
||||
"sha256": hashlib.sha256(hash_tensor.contiguous().numpy().tobytes()).hexdigest(),
|
||||
}
|
||||
if cpu.numel():
|
||||
promoted = cpu.to(torch.float64) if cpu.is_floating_point() else cpu.to(torch.int64)
|
||||
sig["sum"] = _stable_float(promoted.sum().item())
|
||||
sig["mean"] = _stable_float(promoted.float().mean().item())
|
||||
return sig
|
||||
|
||||
|
||||
def _summarize_value(value: Any) -> Any:
|
||||
if isinstance(value, torch.Tensor):
|
||||
return _tensor_signature(value)
|
||||
if isinstance(value, dict):
|
||||
return {k: _summarize_value(v) for k, v in value.items()}
|
||||
if isinstance(value, (list, tuple)):
|
||||
return [_summarize_value(v) for v in value]
|
||||
if isinstance(value, (str, int, float, bool)) or value is None:
|
||||
return value
|
||||
return repr(value)
|
||||
|
||||
|
||||
def _hash_payload(payload: Any) -> str:
|
||||
return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()
|
||||
|
||||
|
||||
def _get_profiler_device_time_us(event: Any) -> float | None:
|
||||
return _stable_float(
|
||||
getattr(event, "self_device_time_total", getattr(event, "self_cuda_time_total", None))
|
||||
)
|
||||
|
||||
|
||||
def _write_profiler_table(profiler: Any, path: Path, *, sort_by: str, row_limit: int = 40) -> None:
|
||||
try:
|
||||
path.write_text(profiler.key_averages().table(sort_by=sort_by, row_limit=row_limit))
|
||||
except Exception:
|
||||
logger.debug("Could not write profiler table for sort_by=%s", sort_by, exc_info=True)
|
||||
|
||||
|
||||
def write_deterministic_forward_artifacts(
    *,
    policy: Any,
    dataset: Any,
    batch_size: int,
    preprocessor: Any,
    output_dir: Path,
    device_type: str,
) -> None:
    """Run a seed-controlled single forward pass and dump a stable fingerprint
    (loss/output tensor hashes + op counts) for regression detection. Keeps
    the caller-selected module mode so ACT-with-VAE-style policies that only
    materialize their full forward outputs in `train()` still match. Models
    with stochastic train-mode layers still rely on the seeded RNG for stable
    fingerprints.

    Args:
        policy: Object exposing ``forward(batch) -> (loss, output_dict)``.
        dataset: Indexable dataset; may also expose ``meta.camera_keys``.
        batch_size: Number of samples collated into the reference batch.
        preprocessor: Callable applied to the collated batch before ``forward``.
        output_dir: Directory where the JSON fingerprint and op table land
            (created if missing).
        device_type: ``"cuda"`` enables CUDA profiling, seeding, and device
            timings; anything else is treated as CPU-only.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError("Cannot build a reference batch from an empty dataset.")
    # Wrap indices modulo the dataset length so batch_size may exceed it.
    indices = [i % len(dataset) for i in range(batch_size)]
    reference_batch = default_collate([dataset[i] for i in indices])
    # Mirror the uint8 → float32/255 conversion the train loop applies after
    # the dataloader (PR #3406). The dataset ships camera frames as uint8 for
    # faster transport, but policies like SmolVLA/xVLA run bilinear
    # interpolation on images which doesn't support Byte tensors.
    camera_keys = tuple(getattr(getattr(dataset, "meta", None), "camera_keys", ()) or ())
    if not camera_keys:
        # Fallback when the dataset has no metadata: treat every tensor under
        # the "observation.images." prefix as a camera frame.
        camera_keys = tuple(
            key
            for key, value in reference_batch.items()
            if key.startswith("observation.images.") and isinstance(value, torch.Tensor)
        )
    for cam_key in camera_keys:
        if cam_key in reference_batch and reference_batch[cam_key].dtype == torch.uint8:
            reference_batch[cam_key] = reference_batch[cam_key].to(dtype=torch.float32) / 255.0
    reference_batch = preprocessor(reference_batch)

    activities = [torch.profiler.ProfilerActivity.CPU]
    if device_type == "cuda":
        activities.append(torch.profiler.ProfilerActivity.CUDA)

    # Fork the RNG so seeding this reference pass does not perturb the
    # training run's randomness; devices=None lets torch include CUDA RNGs.
    with torch.random.fork_rng(devices=[] if device_type != "cuda" else None):
        torch.manual_seed(0)
        if device_type == "cuda":
            torch.cuda.manual_seed_all(0)
        with torch.no_grad(), torch.profiler.profile(activities=activities) as prof:
            loss, output_dict = policy.forward(reference_batch)

    # Sort operators by key so the serialized fingerprint is order-stable
    # across runs regardless of profiler enumeration order.
    operators = sorted(
        (
            {
                "key": e.key,
                "count": e.count,
                "cpu_time_total_us": _stable_float(getattr(e, "cpu_time_total", None)),
                **(
                    {"self_cuda_time_total_us": _get_profiler_device_time_us(e)}
                    if device_type == "cuda"
                    else {}
                ),
            }
            for e in prof.key_averages()
        ),
        key=lambda e: e["key"],
    )
    outputs = {"loss": _summarize_value(loss), "output_dict": _summarize_value(output_dict)}
    payload = {
        "seed": 0,
        "reference_batch_size": batch_size,
        # Op fingerprint intentionally hashes only (key, count) — timings
        # vary run-to-run and would make every comparison a mismatch.
        "operator_fingerprint": _hash_payload([(o["key"], o["count"]) for o in operators]),
        "output_fingerprint": _hash_payload(outputs),
        "operators": operators,
        "outputs": outputs,
    }
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "deterministic_forward.json").write_text(json.dumps(payload, indent=2, sort_keys=True))
    sort_by = "self_cuda_time_total" if device_type == "cuda" else "cpu_time_total"
    _write_profiler_table(prof, output_dir / "deterministic_forward_ops.txt", sort_by=sort_by)
|
||||
|
||||
|
||||
class TrainingProfiler:
|
||||
"""Self-contained profiling hooks for the training loop.
|
||||
|
||||
The training script interacts via ``start()``, ``section()``, ``step()``,
|
||||
``finalize()``, and (optionally) ``record_deterministic_forward()`` — a
|
||||
~7-line surface.
|
||||
"""
|
||||
|
||||
_SCHEDULE_WAIT = 1
|
||||
_SCHEDULE_WARMUP = 2
|
||||
_SCHEDULE_ACTIVE = 6
|
||||
|
||||
def __init__(self, mode: str, output_dir: Path, device: torch.device) -> None:
|
||||
self._mode = mode
|
||||
self._output_dir = output_dir
|
||||
self._output_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._device = device
|
||||
# Inline timing state — no separate collector class.
|
||||
self._total_update_s: list[float] = []
|
||||
self._dataloading_s: list[float] = []
|
||||
self._section_s: dict[str, list[float]] = {}
|
||||
self._memory: list[dict[str, int]] = []
|
||||
self._torch = self._build_torch_profiler()
|
||||
logger.info("Profiling enabled. Artifacts will be written to %s", output_dir)
|
||||
|
||||
def _build_torch_profiler(self) -> Any:
|
||||
activities = [torch.profiler.ProfilerActivity.CPU]
|
||||
if self._device.type == "cuda":
|
||||
activities.append(torch.profiler.ProfilerActivity.CUDA)
|
||||
trace_dir = self._output_dir / "torch_traces"
|
||||
trace_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def _on_trace_ready(p: Any) -> None:
|
||||
if self._mode == "trace":
|
||||
p.export_chrome_trace(str(trace_dir / f"trace_step_{p.step_num}.json"))
|
||||
|
||||
return torch.profiler.profile(
|
||||
activities=activities,
|
||||
schedule=torch.profiler.schedule(
|
||||
wait=self._SCHEDULE_WAIT,
|
||||
warmup=self._SCHEDULE_WARMUP,
|
||||
active=self._SCHEDULE_ACTIVE,
|
||||
repeat=1,
|
||||
),
|
||||
on_trace_ready=_on_trace_ready,
|
||||
record_shapes=True,
|
||||
profile_memory=True,
|
||||
with_flops=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_cfg(cls, cfg: Any, device: torch.device) -> TrainingProfiler:
|
||||
output = cfg.profile_output_dir or (Path(cfg.output_dir) / "profiling")
|
||||
return cls(mode=cfg.profile_mode, output_dir=Path(output), device=device)
|
||||
|
||||
def record_deterministic_forward(
    self, *, policy: Any, dataset: Any, batch_size: int, preprocessor: Any
) -> None:
    """Write deterministic forward-pass fingerprint artifacts to the output dir.

    Delegates to ``write_deterministic_forward_artifacts``; see that helper
    for the artifact format (``deterministic_forward.json``).
    """
    logger.info("Recording deterministic forward-pass artifacts")
    write_deterministic_forward_artifacts(
        policy=policy,
        dataset=dataset,
        batch_size=batch_size,
        preprocessor=preprocessor,
        output_dir=self._output_dir,
        device_type=self._device.type,
    )
    # Release memory the extra forward pass allocated before training begins,
    # so it does not skew the training-loop memory measurements.
    if self._device.type == "cuda":
        torch.cuda.empty_cache()
|
||||
|
||||
def start(self) -> None:
    """Begin profiling: reset CUDA peak-memory stats and start the torch profiler."""
    if self._device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(self._device)
    # Entered manually; finalize() performs the matching __exit__.
    self._torch.__enter__()
|
||||
|
||||
@contextmanager
def section(self, name: str) -> Iterator[None]:
    """Time a region of the training step. Syncs on CUDA so the
    duration reflects GPU work, not just kernel-launch latency."""
    if self._device.type == "cuda":
        torch.cuda.synchronize(self._device)
    t0 = time.perf_counter()
    try:
        yield
    finally:
        if self._device.type == "cuda":
            torch.cuda.synchronize(self._device)
        # Durations accumulate per section name; finalize() summarizes them
        # under "<name>_s" in step_timing_summary.json.
        self._section_s.setdefault(name, []).append(time.perf_counter() - t0)
|
||||
|
||||
def step(self, step_num: int, train_tracker: Any) -> None:
    """Record one step's timings/memory and advance the torch profiler schedule.

    ``train_tracker`` only needs ``update_s`` and ``dataloading_s``
    attributes; values are coerced via ``_as_float`` (plain floats or
    metric-like objects).
    """
    self._total_update_s.append(_as_float(train_tracker.update_s))
    self._dataloading_s.append(_as_float(train_tracker.dataloading_s))
    if self._device.type == "cuda":
        # Point-in-time allocator snapshot, one entry per training step.
        self._memory.append(
            {
                "step": step_num,
                "allocated_bytes": torch.cuda.memory_allocated(self._device),
                "reserved_bytes": torch.cuda.memory_reserved(self._device),
            }
        )
    self._torch.step()
|
||||
|
||||
def finalize(self) -> None:
    """Stop the torch profiler and write the summary JSON plus operator tables."""
    self._torch.__exit__(None, None, None)
    payload: dict[str, Any] = {
        "profile_mode": self._mode,
        "total_update_s": _summary(self._total_update_s),
        "dataloading_s": _summary(self._dataloading_s),
        "memory_timeline": self._memory,
    }
    # One summary entry per section() name, e.g. "forward_s", "backward_s".
    for name, values in self._section_s.items():
        payload[f"{name}_s"] = _summary(values)
    if self._device.type == "cuda":
        payload["peak_memory_allocated_bytes"] = torch.cuda.max_memory_allocated(self._device)
        payload["peak_memory_reserved_bytes"] = torch.cuda.max_memory_reserved(self._device)
    (self._output_dir / "step_timing_summary.json").write_text(
        json.dumps(payload, indent=2, sort_keys=True)
    )

    # Human-readable operator tables, one file per sort key.
    tables_dir = self._output_dir / "torch_tables"
    tables_dir.mkdir(parents=True, exist_ok=True)
    _write_profiler_table(self._torch, tables_dir / "cpu_time_total.txt", sort_by="cpu_time_total")
    _write_profiler_table(self._torch, tables_dir / "cpu_memory.txt", sort_by="self_cpu_memory_usage")
    _write_profiler_table(self._torch, tables_dir / "flops.txt", sort_by="flops")
    if self._device.type == "cuda":
        _write_profiler_table(
            self._torch, tables_dir / "cuda_time_total.txt", sort_by="self_cuda_time_total"
        )
        _write_profiler_table(
            self._torch, tables_dir / "cuda_memory.txt", sort_by="self_cuda_memory_usage"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CI orchestrator. Spawns `lerobot-train` per policy, collects the
|
||||
# artifacts, (optionally) uploads to the HF Hub results dataset.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class UploadTarget:
    """A single local file to upload and its destination path in the Hub repo."""

    # Path of the file on local disk.
    local_path: Path
    # Destination path inside the Hub dataset repository.
    path_in_repo: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class UploadResult:
    """Outcome of a Hub commit made by ``upload_targets``."""

    # Map of path_in_repo -> Hub "resolve" download URL.
    uploaded_paths: dict[str, str]
    # Set when the commit opened a pull request; None for direct commits.
    pr_url: str | None = None
|
||||
|
||||
|
||||
def _utc_timestamp_slug(now: datetime | None = None) -> str:
|
||||
return (now or datetime.now(UTC)).strftime("%Y%m%dT%H%M%SZ")
|
||||
|
||||
|
||||
def _hub_file_url(repo_id: str, path_in_repo: str, *, revision: str = "main") -> str:
|
||||
return f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path_in_repo}"
|
||||
|
||||
|
||||
def parse_discussion_num(pr_url: str | None) -> int | None:
|
||||
if not pr_url:
|
||||
return None
|
||||
m = re.search(r"/discussions/(\d+)$", pr_url)
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
def upload_targets(
    repo_id: str,
    targets: list[UploadTarget],
    *,
    token: str | None = None,
    commit_message: str | None = None,
    create_pr: bool = False,
) -> UploadResult:
    """Commit every target file to the Hub dataset ``repo_id`` in one commit.

    When ``create_pr`` is true, the files land on a PR branch and the
    returned URLs point at ``refs/pr/<n>`` instead of ``main``.
    """
    operations = [
        CommitOperationAdd(path_in_repo=target.path_in_repo, path_or_fileobj=str(target.local_path))
        for target in targets
    ]
    message = commit_message or f"Upload {len(targets)} profiling artifacts"
    commit = HfApi(token=token).create_commit(
        repo_id=repo_id,
        repo_type="dataset",
        operations=operations,
        commit_message=message,
        revision="main",
        create_pr=create_pr,
    )
    pr_num = parse_discussion_num(commit.pr_url)
    revision = f"refs/pr/{pr_num}" if (create_pr and pr_num) else "main"
    uploaded = {
        target.path_in_repo: _hub_file_url(repo_id, target.path_in_repo, revision=revision)
        for target in targets
    }
    return UploadResult(uploaded_paths=uploaded, pr_url=commit.pr_url)
|
||||
|
||||
|
||||
def build_train_command(policy: str, run_dir: Path, profile_mode: str) -> list[str]:
    """Assemble the ``uv run lerobot-train`` invocation for one policy run."""
    spec = POLICY_SPECS[policy]
    steps = spec["steps"]
    command: list[str] = ["uv", "run", "lerobot-train"]
    command.extend(spec["train_args"])
    command.extend(
        [
            f"--output_dir={run_dir / 'train'}",
            f"--steps={steps}",
            "--eval_freq=0",
            "--save_checkpoint=false",
            f"--save_freq={steps}",
            "--wandb.enable=false",
            "--policy.push_to_hub=false",
            "--num_workers=0",
            "--log_freq=1",
            f"--profile_mode={profile_mode}",
            f"--profile_output_dir={run_dir / 'profiling'}",
        ]
    )
    return command
|
||||
|
||||
|
||||
def build_artifact_index(
    *, repo_id: str, run_dir: Path, policy_name: str, run_id: str
) -> tuple[dict[str, Any], dict[str, Any], list[UploadTarget], str]:
    """Scan the run directory and categorize files into
    (stdout/stderr, torch_tables/*, torch_traces/*, everything else under profiling/).
    Returns (paths, urls, upload targets, row path in repo)."""
    row_path_in_repo = f"rows/{policy_name}/{run_id}.json"
    # All artifact files for this run live under one repo prefix.
    root = f"artifacts/{policy_name}/{run_id}"
    # "paths" holds repo-relative paths; "urls" holds the matching resolve URLs.
    paths: dict[str, Any] = {
        "row": row_path_in_repo,
        "profiling_files": {},
        "torch_tables": {},
        "trace_files": {},
    }
    urls: dict[str, Any] = {
        "row": _hub_file_url(repo_id, row_path_in_repo),
        "profiling_files": {},
        "torch_tables": {},
        "trace_files": {},
    }
    targets: list[UploadTarget] = []

    # Captured subprocess logs, uploaded when present.
    for name in ("stdout.txt", "stderr.txt"):
        p = run_dir / name
        if p.exists():
            key = name.removesuffix(".txt")
            repo = f"{root}/{name}"
            paths[key] = repo
            urls[key] = _hub_file_url(repo_id, repo)
            targets.append(UploadTarget(p, repo))

    profiling_dir = run_dir / "profiling"
    if profiling_dir.exists():
        for p in sorted(profiling_dir.rglob("*")):
            if not p.is_file():
                continue
            rel = str(p.relative_to(run_dir))
            repo = f"{root}/{rel}"
            # Every profiling file is indexed and uploaded ...
            paths["profiling_files"][rel] = repo
            urls["profiling_files"][rel] = _hub_file_url(repo_id, repo)
            targets.append(UploadTarget(p, repo))
            # ... and notable files are also surfaced under dedicated keys.
            if p.name == "step_timing_summary.json":
                paths["step_timing_summary"] = repo
                urls["step_timing_summary"] = _hub_file_url(repo_id, repo)
            elif "torch_tables" in p.parts:
                paths["torch_tables"][p.name] = repo
                urls["torch_tables"][p.name] = _hub_file_url(repo_id, repo)
            elif "torch_traces" in p.parts:
                paths["trace_files"][p.name] = repo
                urls["trace_files"][p.name] = _hub_file_url(repo_id, repo)

    return paths, urls, targets, row_path_in_repo
|
||||
|
||||
|
||||
def upload_profile_run(
    *,
    repo_id: str,
    row_path: Path,
    row_path_in_repo: str,
    artifact_targets: list[UploadTarget],
    create_pr: bool = False,
) -> UploadResult:
    """Publish one run in a single commit: all artifacts plus the summary row."""
    all_targets = list(artifact_targets)
    all_targets.append(UploadTarget(row_path, row_path_in_repo))
    return upload_targets(
        repo_id=repo_id,
        targets=all_targets,
        commit_message=f"Add model profiling row {row_path_in_repo}",
        create_pr=create_pr,
    )
|
||||
|
||||
|
||||
def _load_json(path: Path) -> dict[str, Any]:
|
||||
return json.loads(path.read_text()) if path.exists() else {}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the profiling orchestrator."""
    parser = argparse.ArgumentParser(description=__doc__)
    add = parser.add_argument
    add("--policies", nargs="*", default=None)
    add("--output_dir", type=Path, required=True)
    add("--hub_org", default="lerobot")
    add("--results_repo", default="model-profiling-history")
    add("--publish", action="store_true")
    add("--profile_mode", choices=["summary", "trace"], default="trace")
    add("--git_commit", default="")
    add("--git_ref", default="")
    add("--pr_number", default="")
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Profile each selected policy in a subprocess and emit one JSON row per run.

    Returns a process exit code: 0 when every training subprocess succeeded,
    1 when any of them returned nonzero. Publish failures do NOT affect the
    exit code — they are recorded in the row instead.
    """
    args = parse_args()
    selected = args.policies or list(POLICY_SPECS)
    unknown = sorted(set(selected) - set(POLICY_SPECS))
    if unknown:
        raise ValueError(f"Unknown profiling policies: {', '.join(unknown)}")

    args.output_dir.mkdir(parents=True, exist_ok=True)
    # Accept a bare repo name (prefixed with --hub_org) or a full "org/name".
    repo_id = args.results_repo if "/" in args.results_repo else f"{args.hub_org}/{args.results_repo}"
    git_exe = shutil.which("git")
    if not git_exe:
        raise RuntimeError("git not found in PATH")
    git_commit = args.git_commit or subprocess.check_output([git_exe, "rev-parse", "HEAD"], text=True).strip()
    pr_number = int(args.pr_number) if str(args.pr_number).strip() else None
    exit_code = 0

    for policy in selected:
        run_id = f"{_utc_timestamp_slug()}__{policy}"
        run_dir = args.output_dir / policy / run_id
        run_dir.mkdir(parents=True, exist_ok=True)
        cmd = build_train_command(policy, run_dir, args.profile_mode)

        t0 = time.perf_counter()
        result = subprocess.run(cmd, capture_output=True, text=True)
        wall_s = time.perf_counter() - t0

        (run_dir / "stdout.txt").write_text(result.stdout)
        (run_dir / "stderr.txt").write_text(result.stderr)
        if result.returncode != 0:
            # Keep profiling the remaining policies; report the failure at exit.
            exit_code = 1

        paths, urls, upload_list, row_in_repo = build_artifact_index(
            repo_id=repo_id, run_dir=run_dir, policy_name=policy, run_id=run_id
        )
        # One self-contained result row per run, embedding the summaries
        # (missing artifacts from a failed run simply become {}).
        row: dict[str, Any] = {
            "schema_version": 1,
            "created_at": datetime.now(UTC).isoformat(),
            "run_id": run_id,
            "policy": policy,
            "git_commit": git_commit,
            "git_ref": args.git_ref or None,
            "pr_number": pr_number,
            "status": "success" if result.returncode == 0 else "failed",
            "return_code": result.returncode,
            "profile_mode": args.profile_mode,
            "wall_time_s": wall_s,
            "spec": {
                "steps": POLICY_SPECS[policy]["steps"],
                "train_args": POLICY_SPECS[policy]["train_args"],
            },
            "step_timing_summary": _load_json(run_dir / "profiling" / "step_timing_summary.json"),
            "deterministic_forward": _load_json(run_dir / "profiling" / "deterministic_forward.json"),
            "artifact_paths": paths,
            "artifact_urls": urls,
            "stderr_tail": result.stderr.splitlines()[-20:],
        }

        row_path = run_dir / "profiling_row.json"
        row_path.write_text(json.dumps(row, indent=2, sort_keys=True))

        if args.publish:
            try:
                uploaded = upload_profile_run(
                    repo_id=repo_id,
                    row_path=row_path,
                    row_path_in_repo=row_in_repo,
                    artifact_targets=upload_list,
                    # PR-triggered runs go through a Hub PR instead of main.
                    create_pr=pr_number is not None,
                )
            except HfHubHTTPError as exc:
                # Publishing is best-effort: record the failure in the row
                # but do not fail the whole profiling job.
                row.update({"publish_status": "failed", "publish_error": str(exc)})
            else:
                row.update(
                    {
                        "publish_status": "success",
                        "uploaded_paths": uploaded.uploaded_paths,
                        "publish_pr_url": uploaded.pr_url,
                        "publish_pr_number": parse_discussion_num(uploaded.pr_url),
                    }
                )
            # Rewrite the row so the publish outcome is captured on disk too.
            row_path.write_text(json.dumps(row, indent=2, sort_keys=True))

        print(json.dumps(row, indent=2, sort_keys=True))

    return exit_code
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell.
    raise SystemExit(main())
|
||||
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from huggingface_hub.errors import HfHubHTTPError
|
||||
|
||||
from lerobot.utils import model_profiling as mp
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Policy spec matrix
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_policy_specs_cover_expected_policies():
    """The profiling matrix tracks exactly the intended set of policies."""
    expected = {
        "act",
        "diffusion",
        "groot",
        "multi_task_dit",
        "pi0",
        "pi0_fast",
        "pi05",
        "smolvla",
        "wall_x",
        "xvla",
    }
    assert set(mp.POLICY_SPECS) == expected
    # Sanity: excluded policies should stay out of the matrix.
    excluded = ("sac", "sarm", "tdmpc", "vqbet", "reward_classifier")
    assert all(name not in mp.POLICY_SPECS for name in excluded)
|
||||
|
||||
|
||||
def test_pretrained_libero_specs_match_expected_camera_keys_and_normalization():
    """Pi0-family specs carry the camera rename map (and, for pi05, the
    normalization override); SmolVLA uses its own camera1/camera2 naming."""
    base_rgb_rename = (
        '--rename_map={"observation.images.front": "observation.images.base_0_rgb", '
        '"observation.images.wrist": "observation.images.left_wrist_0_rgb"}'
    )
    for name in ("pi0", "pi0_fast", "pi05"):
        assert base_rgb_rename in mp.POLICY_SPECS[name]["train_args"]
    # pi05 additionally overrides action normalization to MEAN_STD.
    assert any(
        arg.startswith('--policy.normalization_mapping={"ACTION": "MEAN_STD"')
        for arg in mp.POLICY_SPECS["pi05"]["train_args"]
    )
    assert (
        '--rename_map={"observation.images.front": "observation.images.camera1", '
        '"observation.images.wrist": "observation.images.camera2"}'
        in mp.POLICY_SPECS["smolvla"]["train_args"]
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CI orchestrator helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_build_train_command_includes_profiling_outputs(tmp_path):
    """The generated CLI launches lerobot-train with profiling flags wired in."""
    cmd = mp.build_train_command("act", tmp_path / "run", "trace")
    assert cmd[:3] == ["uv", "run", "lerobot-train"]
    for prefix in ("--output_dir=", "--profile_output_dir="):
        assert any(arg.startswith(prefix) for arg in cmd)
    assert "--profile_mode=trace" in cmd
    assert "--eval_freq=0" in cmd
|
||||
|
||||
|
||||
def test_build_artifact_index_collects_tables_and_traces(tmp_path):
    """Files on disk are categorized into stdout/stderr, the timing summary,
    torch tables, and chrome traces — each with a repo path and resolve URL."""
    run_dir = tmp_path / "act" / "20260415T000000Z__act"
    profiling = run_dir / "profiling"
    (profiling / "torch_tables").mkdir(parents=True)
    (profiling / "torch_traces").mkdir(parents=True)
    (profiling / "step_timing_summary.json").write_text("{}")
    (profiling / "deterministic_forward.json").write_text(
        json.dumps({"operator_fingerprint": "ops", "output_fingerprint": "out"})
    )
    (profiling / "torch_tables" / "cpu_time_total.txt").write_text("cpu table")
    (profiling / "torch_traces" / "trace_step_9.json").write_text("{}")
    (run_dir / "stdout.txt").write_text("stdout")
    (run_dir / "stderr.txt").write_text("stderr")

    paths, urls, targets, row_in_repo = mp.build_artifact_index(
        repo_id="lerobot/model-profiling-history",
        run_dir=run_dir,
        policy_name="act",
        run_id="20260415T000000Z__act",
    )

    assert row_in_repo == "rows/act/20260415T000000Z__act.json"
    assert paths["stdout"].endswith("/stdout.txt")
    assert paths["step_timing_summary"].endswith("/profiling/step_timing_summary.json")
    assert "cpu_time_total.txt" in paths["torch_tables"]
    assert "trace_step_9.json" in paths["trace_files"]
    assert urls["row"].startswith("https://huggingface.co/datasets/lerobot/model-profiling-history/")
    # stdout + stderr + 4 profiling files
    assert len(targets) == 6
|
||||
|
||||
|
||||
def test_upload_targets_batches_preview_publish_into_single_hf_pr(monkeypatch, tmp_path):
    """upload_targets issues exactly one create_commit and, for PR publishes,
    rewrites the returned URLs to the refs/pr/<n> revision."""
    local_path = tmp_path / "profiling_row.json"
    local_path.write_text("{}")
    # Captures the kwargs that reach HfApi.create_commit.
    captured: dict[str, object] = {}

    class _FakeCommit:
        pr_url = "https://huggingface.co/datasets/lerobot/model-profiling-history/discussions/42"

    class _FakeApi:
        def __init__(self, token=None):
            captured["token"] = token

        def create_commit(self, **kwargs):
            captured.update(kwargs)
            return _FakeCommit()

    monkeypatch.setattr(mp, "HfApi", _FakeApi)

    result = mp.upload_targets(
        repo_id="lerobot/model-profiling-history",
        targets=[mp.UploadTarget(local_path, "rows/act/run.json")],
        create_pr=True,
        token="hf_test_token",
    )

    assert captured["repo_id"] == "lerobot/model-profiling-history"
    assert captured["repo_type"] == "dataset"
    assert captured["create_pr"] is True
    assert result.pr_url == _FakeCommit.pr_url
    # URL points at the PR branch, not main.
    assert result.uploaded_paths["rows/act/run.json"].endswith("/resolve/refs/pr/42/rows/act/run.json")
|
||||
|
||||
|
||||
def test_parse_discussion_num_handles_hf_discussion_urls():
    """Only URLs ending in /discussions/<n> yield an integer."""
    pr_url = "https://huggingface.co/datasets/lerobot/model-profiling-history/discussions/42"
    assert mp.parse_discussion_num(pr_url) == 42
    repo_url = "https://huggingface.co/datasets/lerobot/model-profiling-history"
    assert mp.parse_discussion_num(repo_url) is None
    assert mp.parse_discussion_num(None) is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# main() smoke tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def fake_args(tmp_path):
    """Shared argparse namespace for main() smoke tests — overridden per-test."""
    return argparse.Namespace(
        policies=["act"],
        output_dir=tmp_path / "results",
        hub_org="lerobot",
        results_repo="model-profiling-history",
        publish=False,
        profile_mode="summary",
        # Empty: main() falls back to `git rev-parse HEAD` unless the test
        # either sets this or monkeypatches subprocess.check_output.
        git_commit="",
        git_ref="codex/model-profiling",
        pr_number="3389",
    )
|
||||
|
||||
|
||||
def _stub_train_subprocess(mp_module, *, returncode: int = 0, write_artifacts: bool = True):
    """Build a fake subprocess.run that writes the profiling artifacts main() expects."""

    def _fake_run(cmd, capture_output, text):
        assert capture_output is True
        assert text is True
        # Recover the profiling dir from the generated CLI flags.
        profile_dir = Path(next(a.split("=", 1)[1] for a in cmd if a.startswith("--profile_output_dir=")))
        profile_dir.mkdir(parents=True, exist_ok=True)
        if write_artifacts:
            (profile_dir / "torch_tables").mkdir(parents=True, exist_ok=True)
            (profile_dir / "step_timing_summary.json").write_text(
                json.dumps({"total_update_s": {"count": 1, "mean": 0.3}, "peak_memory_allocated_bytes": 1024})
            )
            (profile_dir / "deterministic_forward.json").write_text(
                json.dumps(
                    {"operator_fingerprint": "ops-fingerprint", "output_fingerprint": "output-fingerprint"}
                )
            )
            (profile_dir / "torch_tables" / "cpu_time_total.txt").write_text("cpu time table")
        return subprocess.CompletedProcess(cmd, returncode, "stdout ok", "")

    return _fake_run
|
||||
|
||||
|
||||
def test_main_smoke_writes_row(monkeypatch, fake_args):
    """A successful run produces exactly one profiling_row.json embedding the
    timing summary and the deterministic-forward fingerprints."""
    monkeypatch.setattr(mp, "parse_args", lambda: fake_args)
    # fake_args.git_commit is empty, so main() shells out to git — stub it.
    monkeypatch.setattr(mp.subprocess, "check_output", lambda *a, **k: "deadbeef\n")
    monkeypatch.setattr(mp.subprocess, "run", _stub_train_subprocess(mp))

    assert mp.main() == 0

    row_paths = list(fake_args.output_dir.rglob("profiling_row.json"))
    assert len(row_paths) == 1
    row = json.loads(row_paths[0].read_text())
    assert row["policy"] == "act"
    assert row["status"] == "success"
    assert row["git_commit"] == "deadbeef"
    assert row["git_ref"] == "codex/model-profiling"
    assert row["pr_number"] == 3389
    assert row["step_timing_summary"]["total_update_s"]["mean"] == 0.3
    assert row["deterministic_forward"]["operator_fingerprint"] == "ops-fingerprint"
|
||||
|
||||
|
||||
def test_main_records_publish_failure_without_failing(monkeypatch, fake_args):
    """A Hub upload error is recorded in the row but does not flip main()'s
    exit code — publishing is best-effort."""
    fake_args.publish = True
    fake_args.git_commit = "deadbeef"
    monkeypatch.setattr(mp, "parse_args", lambda: fake_args)
    monkeypatch.setattr(mp.subprocess, "run", _stub_train_subprocess(mp, write_artifacts=False))

    def _fail_upload(**kwargs):
        # Minimal response stand-in so HfHubHTTPError can be constructed.
        resp = type("Resp", (), {"status_code": 403, "headers": {}, "request": None})()
        raise HfHubHTTPError("403 Forbidden: Authorization error.", response=resp)

    monkeypatch.setattr(mp, "upload_profile_run", _fail_upload)

    assert mp.main() == 0
    row = json.loads(next(fake_args.output_dir.rglob("profiling_row.json")).read_text())
    assert row["status"] == "success"
    assert row["publish_status"] == "failed"
    assert "Authorization error" in row["publish_error"]
|
||||
|
||||
|
||||
def test_main_returns_nonzero_when_training_subprocess_fails(monkeypatch, fake_args):
    """A failing lerobot-train run flips the exit code and marks the row failed."""
    monkeypatch.setattr(mp, "parse_args", lambda: fake_args)
    monkeypatch.setattr(mp.subprocess, "check_output", lambda *a, **k: "deadbeef\n")
    monkeypatch.setattr(mp.subprocess, "run", _stub_train_subprocess(mp, returncode=3))

    assert mp.main() == 1

    row_file = next(fake_args.output_dir.rglob("profiling_row.json"))
    row = json.loads(row_file.read_text())
    assert row["status"] == "failed"
    assert row["return_code"] == 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TrainingProfiler behavior
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_deterministic_forward_artifacts_preserve_policy_mode(tmp_path):
    """Recording the deterministic forward pass must leave the policy in
    train mode and call forward exactly once."""

    class _TrainingOnlyPolicy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.forward_calls = 0

        def forward(self, batch):
            self.forward_calls += 1
            # The helper must not flip the module into eval mode.
            assert self.training
            return batch["value"].sum(), {"value": batch["value"]}

    dataset = [{"value": torch.tensor([1.0, 2.0])}]
    policy = _TrainingOnlyPolicy()
    policy.train()

    mp.write_deterministic_forward_artifacts(
        policy=policy,
        dataset=dataset,
        batch_size=2,
        preprocessor=lambda b: b,
        output_dir=tmp_path,
        device_type="cpu",
    )

    payload = json.loads((tmp_path / "deterministic_forward.json").read_text())
    assert policy.training is True
    assert policy.forward_calls == 1
    assert payload["reference_batch_size"] == 2
    assert "operator_fingerprint" in payload
    assert payload["outputs"]["loss"]["numel"] == 1
|
||||
|
||||
|
||||
def test_deterministic_forward_artifacts_infers_image_keys_without_dataset_meta(tmp_path):
    """uint8 image tensors are converted to float32 in [0, 1] even when the
    dataset exposes no camera-key metadata."""

    class _ImagePolicy(torch.nn.Module):
        def forward(self, batch):
            image = batch["observation.images.front"]
            assert image.dtype == torch.float32
            assert torch.all((image >= 0.0) & (image <= 1.0))
            return image.sum(), {"image": image}

    # Raw uint8 frame spanning the full 0..255 range.
    dataset = [{"observation.images.front": torch.tensor([[[0, 255]]], dtype=torch.uint8)}]

    mp.write_deterministic_forward_artifacts(
        policy=_ImagePolicy(),
        dataset=dataset,
        batch_size=1,
        preprocessor=lambda b: b,
        output_dir=tmp_path,
        device_type="cpu",
    )

    payload = json.loads((tmp_path / "deterministic_forward.json").read_text())
    assert payload["outputs"]["loss"]["numel"] == 1
    assert payload["outputs"]["output_dict"]["image"]["dtype"] == "torch.float32"
|
||||
|
||||
|
||||
def test_training_profiler_section_records_forward_backward_optimizer(tmp_path):
    """Each section() name accrues one timing per step and is summarized as
    '<name>_s' in step_timing_summary.json."""
    profiler = mp.TrainingProfiler(mode="summary", output_dir=tmp_path, device=torch.device("cpu"))
    profiler.start()
    for _ in range(3):
        with profiler.section("forward"):
            pass
        with profiler.section("backward"):
            pass
        with profiler.section("optimizer"):
            pass
        profiler.step(1, argparse.Namespace(update_s=0.5, dataloading_s=0.01))
    profiler.finalize()

    payload = json.loads((tmp_path / "step_timing_summary.json").read_text())
    assert payload["forward_s"]["count"] == 3
    assert payload["backward_s"]["count"] == 3
    assert payload["optimizer_s"]["count"] == 3
    assert payload["total_update_s"]["mean"] == 0.5
|
||||
|
||||
|
||||
def test_training_profiler_accepts_metric_like_values(tmp_path):
    """step() accepts metric-like objects (with a .val attribute), not just floats."""

    class _MetricLike:
        def __init__(self, v):
            self.val = v

    profiler = mp.TrainingProfiler(mode="summary", output_dir=tmp_path, device=torch.device("cpu"))
    profiler.start()
    profiler.step(1, argparse.Namespace(update_s=_MetricLike(0.6), dataloading_s=_MetricLike(0.05)))
    profiler.finalize()

    payload = json.loads((tmp_path / "step_timing_summary.json").read_text())
    assert payload["total_update_s"]["mean"] == 0.6
    assert payload["dataloading_s"]["mean"] == 0.05
|
||||
|
||||
|
||||
def test_profiler_device_time_uses_generic_attr_first():
    """Events exposing self_device_time_total are read via that generic attribute."""
    event = type("_Event", (), {"self_device_time_total": 12.3456})()
    assert mp._get_profiler_device_time_us(event) == 12.3456
|
||||
Reference in New Issue
Block a user