mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-18 08:47:05 +00:00
chore: drop one-off bench/build/train scripts from the PR
Remove development-only tooling that doesn't belong in the PR: - examples/benchmark/* (pi052 step/kernel benchmark slurm + harness) - examples/port_datasets/slurm_build_robocasa_composite_seen.py and src/lerobot/scripts/build_robocasa_composite_seen.py (composite_seen dataset build scripts) - scripts/build_episode_filter.py, scripts/build_robocasa_smoke.sh, scripts/train_pi052_human300_exclude_unannotated.sh None are imported by the library, tests, or entry points. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -1,74 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-kernels
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=01:30:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_kernels_%j.out
|
||||
|
||||
# HF kernels exploration via Liger's apply_liger_kernel_to_paligemma.
|
||||
# Baseline (SDPA, no kernels) vs. per-subkernel ablations vs. all-on.
|
||||
# Same harness as bench_pi052_step.py — only the --kernels flag varies
|
||||
# across runs so any delta is attributable to the patched op(s).
|
||||
#
|
||||
# Subkernels exercised: rope, rms_norm, geglu, layer_norm.
|
||||
# Skipped: cross_entropy / fused_linear_cross_entropy — pi052 calls
|
||||
# F.cross_entropy directly and bypasses PaliGemma's forward, so those
|
||||
# patches wouldn't fire without model-code changes (separate PR).
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
|
||||
# /fsx triton cache is shared across nodes with different glibc versions
|
||||
# — kernels built on one node trip GLIBC_2.34-not-found on another. Use
|
||||
# a node-local cache per job to side-step that.
|
||||
export TRITON_CACHE_DIR="/tmp/triton_${SLURM_JOB_ID}"
|
||||
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${SLURM_JOB_ID}"
|
||||
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||
ldd --version | head -1
|
||||
|
||||
# Liger isn't in our standard env yet — install on the compute node so
|
||||
# the slurm log captures the exact version that produced the numbers.
|
||||
python -m pip install -q --upgrade 'liger-kernel'
|
||||
python - <<'PY' || true
|
||||
from importlib.metadata import version, PackageNotFoundError
|
||||
try:
|
||||
print("liger-kernel", version("liger-kernel"))
|
||||
except PackageNotFoundError:
|
||||
print("liger-kernel: not importable")
|
||||
import liger_kernel.transformers as t
|
||||
print("apply_liger_kernel_to_paligemma:", hasattr(t, "apply_liger_kernel_to_paligemma"))
|
||||
PY
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# -- Baseline (no kernels) at the BS we actually train at. --
|
||||
run --attn sdpa --batch-size 8 --kernels none
|
||||
run --attn sdpa --batch-size 16 --kernels none
|
||||
|
||||
# -- Per-subkernel ablations at BS=16 to isolate each contributor. --
|
||||
run --attn sdpa --batch-size 16 --kernels rms_norm
|
||||
run --attn sdpa --batch-size 16 --kernels geglu
|
||||
run --attn sdpa --batch-size 16 --kernels layer_norm
|
||||
run --attn sdpa --batch-size 16 --kernels rope
|
||||
|
||||
# -- All-on, both BS to compare against the matched baselines above. --
|
||||
run --attn sdpa --batch-size 8 --kernels all
|
||||
run --attn sdpa --batch-size 16 --kernels all
|
||||
|
||||
# -- Headroom check: does kernels-all let BS=24 fit (baseline OOMs near here)? --
|
||||
run --attn sdpa --batch-size 24 --kernels none
|
||||
run --attn sdpa --batch-size 24 --kernels all
|
||||
@@ -1,338 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Benchmark ``PI052Policy.forward + backward`` on a single GPU.
|
||||
|
||||
Compares the new SDPA attention path against the eager baseline by
|
||||
monkeypatching ``sdpa_attention_forward`` before the first model
|
||||
forward — so both runs share identical Q/K/V plumbing and only the
|
||||
attention kernel differs. Reports steps/sec and peak GPU memory.
|
||||
|
||||
SLURM-only:
|
||||
|
||||
sbatch examples/benchmark/bench_pi052_step.slurm
|
||||
|
||||
Or one-off:
|
||||
|
||||
srun --partition=hopper-prod --qos=high --gpus=1 --time=15 \\
|
||||
python examples/benchmark/bench_pi052_step.py --attn sdpa --batch-size 8
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def _maybe_patch_eager() -> None:
|
||||
"""Swap ``sdpa_attention_forward`` for the original eager forward.
|
||||
|
||||
Must be called BEFORE PI052Policy is instantiated — the layer
|
||||
compute functions resolve the symbol at call time (module-level
|
||||
lookup), so this patch covers both pi05 and pi052 KI paths."""
|
||||
from transformers.models.gemma import modeling_gemma
|
||||
|
||||
from lerobot.policies.pi05 import modeling_pi05
|
||||
|
||||
modeling_pi05.sdpa_attention_forward = modeling_gemma.eager_attention_forward
|
||||
|
||||
|
||||
_LIGER_SUBKERNELS = ("rope", "rms_norm", "geglu", "layer_norm")
|
||||
|
||||
|
||||
def _maybe_patch_liger(spec: str) -> dict:
|
||||
"""Globally patch PaliGemma/Gemma/Siglip modules with Liger Triton kernels.
|
||||
|
||||
Must be called BEFORE PI052Policy is instantiated — Liger replaces
|
||||
classes inside ``transformers.models.{gemma,gemma2,siglip,paligemma}``,
|
||||
so any model built after the call picks up the fused forwards.
|
||||
|
||||
``spec`` is a comma-separated subset of {rope, rms_norm, geglu,
|
||||
layer_norm} (also ``all`` and ``none``). ``cross_entropy`` and
|
||||
``fused_linear_cross_entropy`` are intentionally skipped — pi052's
|
||||
losses use ``F.cross_entropy`` directly (not ``nn.CrossEntropyLoss``)
|
||||
and never traverse ``PaliGemmaForConditionalGeneration.forward``,
|
||||
so neither patch would fire without invasive model-code changes.
|
||||
"""
|
||||
enabled = dict.fromkeys(_LIGER_SUBKERNELS, False)
|
||||
if spec in ("", "none"):
|
||||
return enabled
|
||||
tokens = [t.strip() for t in spec.split(",") if t.strip()]
|
||||
if tokens == ["all"]:
|
||||
enabled = dict.fromkeys(_LIGER_SUBKERNELS, True)
|
||||
else:
|
||||
for t in tokens:
|
||||
if t not in enabled:
|
||||
raise SystemExit(f"Unknown liger subkernel: {t!r}. Choose from {_LIGER_SUBKERNELS} or 'all'.")
|
||||
enabled[t] = True
|
||||
|
||||
from liger_kernel.transformers import apply_liger_kernel_to_paligemma
|
||||
|
||||
apply_liger_kernel_to_paligemma(
|
||||
rope=enabled["rope"],
|
||||
rms_norm=enabled["rms_norm"],
|
||||
geglu=enabled["geglu"],
|
||||
layer_norm=enabled["layer_norm"],
|
||||
cross_entropy=False,
|
||||
fused_linear_cross_entropy=False,
|
||||
)
|
||||
return enabled
|
||||
|
||||
|
||||
def _maybe_patch_flex() -> None:
|
||||
"""Swap ``sdpa_attention_forward`` for a FlexAttention-backed forward.
|
||||
|
||||
Experimental: builds a per-call ``score_mod`` from the additive
|
||||
mask and dispatches to a compiled ``flex_attention`` kernel.
|
||||
|
||||
Known issue on torch 2.7.1: dynamo errors out with
|
||||
``FlexAttentionHigherOrderVariable() has no type`` when the
|
||||
``score_mod`` closure captures a per-call bias tensor. A proper
|
||||
port needs ``create_block_mask(mask_mod, ...)`` plumbed at the
|
||||
PI05Pytorch.forward level so a BlockMask object can be passed
|
||||
down to the layer compute, not a per-call closure. Left as
|
||||
future work; keep this stub for benchmark experimentation."""
|
||||
import torch
|
||||
from torch.nn.attention.flex_attention import flex_attention
|
||||
|
||||
from lerobot.policies.pi05 import modeling_pi05
|
||||
|
||||
compiled_flex = torch.compile(flex_attention, dynamic=True)
|
||||
|
||||
def flex_forward(module, query, key, value, attention_mask, scaling, dropout=0.0):
|
||||
n_rep = module.num_key_value_groups
|
||||
if n_rep > 1:
|
||||
key = key.repeat_interleave(n_rep, dim=1)
|
||||
value = value.repeat_interleave(n_rep, dim=1)
|
||||
|
||||
bias = attention_mask # (B, 1, Lq, Lk) additive
|
||||
|
||||
def score_mod(score, b, h, q_idx, kv_idx):
|
||||
return score + bias[b, 0, q_idx, kv_idx]
|
||||
|
||||
attn_output = compiled_flex(query, key, value, score_mod=score_mod, scale=scaling)
|
||||
return attn_output.transpose(1, 2).contiguous(), None
|
||||
|
||||
modeling_pi05.sdpa_attention_forward = flex_forward
|
||||
|
||||
|
||||
def _build_policy(args, device: torch.device):
|
||||
"""Random-init PI052Policy at production-relevant shapes."""
|
||||
from lerobot.configs.types import FeatureType, PolicyFeature
|
||||
from lerobot.policies.pi052.configuration_pi052 import PI052Config
|
||||
from lerobot.policies.pi052.modeling_pi052 import PI052Policy
|
||||
|
||||
# Production has ``unfreeze_lm_head=True`` + ``text_loss_weight>0``,
|
||||
# which flips ``train_expert_only=False`` in __post_init__ and
|
||||
# makes the whole PaliGemma + Gemma-expert stack trainable. We
|
||||
# mirror that here so the optimizer-state count reflects reality;
|
||||
# the loss path still goes through ``PI05Policy.forward`` because
|
||||
# ``text_labels`` / FAST tokens are absent from the synthetic batch
|
||||
# (see ``PI052Policy.forward`` early-return).
|
||||
config = PI052Config(
|
||||
max_action_dim=args.action_dim,
|
||||
max_state_dim=args.state_dim,
|
||||
dtype=args.dtype,
|
||||
knowledge_insulation=args.knowledge_insulation,
|
||||
text_loss_weight=1e-3 if args.train_full else 0.0,
|
||||
flow_loss_weight=1.0,
|
||||
enable_fast_action_loss=False,
|
||||
unfreeze_lm_head=args.train_full,
|
||||
tokenizer_max_length=args.lang_tokens,
|
||||
device="cuda",
|
||||
compile_model=args.compile_model,
|
||||
compile_mode=args.compile_mode,
|
||||
)
|
||||
config.input_features = {
|
||||
"observation.state": PolicyFeature(type=FeatureType.STATE, shape=(args.state_dim,)),
|
||||
"observation.images.base_0_rgb": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
|
||||
}
|
||||
config.output_features = {
|
||||
"action": PolicyFeature(type=FeatureType.ACTION, shape=(args.action_dim,)),
|
||||
}
|
||||
policy = PI052Policy(config)
|
||||
policy.to(device)
|
||||
if args.gradient_checkpointing:
|
||||
policy.model.gradient_checkpointing_enable()
|
||||
policy.train()
|
||||
return policy, config
|
||||
|
||||
|
||||
def _build_batch(args, config, device: torch.device) -> dict:
|
||||
"""Synthetic batch matching the training-loop input contract."""
|
||||
from lerobot.utils.constants import (
|
||||
ACTION,
|
||||
OBS_LANGUAGE_ATTENTION_MASK,
|
||||
OBS_LANGUAGE_TOKENS,
|
||||
)
|
||||
|
||||
B = args.batch_size
|
||||
L = args.lang_tokens
|
||||
return {
|
||||
OBS_LANGUAGE_TOKENS: torch.randint(0, 250000, (B, L), device=device),
|
||||
OBS_LANGUAGE_ATTENTION_MASK: torch.ones(B, L, dtype=torch.bool, device=device),
|
||||
"observation.images.base_0_rgb": torch.rand(B, 3, 224, 224, device=device),
|
||||
"observation.images.base_0_rgb_padding_mask": torch.ones(B, dtype=torch.bool, device=device),
|
||||
"observation.state": torch.randn(B, args.state_dim, device=device),
|
||||
ACTION: torch.randn(B, config.chunk_size, args.action_dim, device=device),
|
||||
"action_is_pad": torch.zeros(B, config.chunk_size, dtype=torch.bool, device=device),
|
||||
"task": ["bench task"] * B,
|
||||
}
|
||||
|
||||
|
||||
def _step(policy, batch, optimizer=None) -> torch.Tensor:
|
||||
loss, _ = policy.forward(batch)
|
||||
loss.backward()
|
||||
if optimizer is not None:
|
||||
optimizer.step()
|
||||
optimizer.zero_grad(set_to_none=True)
|
||||
else:
|
||||
for p in policy.parameters():
|
||||
if p.grad is not None:
|
||||
p.grad = None
|
||||
return loss.detach()
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--attn", choices=["sdpa", "eager", "flex"], default="sdpa")
|
||||
parser.add_argument(
|
||||
"--kernels",
|
||||
default="none",
|
||||
help=(
|
||||
"Liger sub-kernels to enable, comma-separated. Choose from "
|
||||
f"{_LIGER_SUBKERNELS} or use 'all' / 'none' (default). Applied "
|
||||
"via apply_liger_kernel_to_paligemma() BEFORE model build."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--compile",
|
||||
dest="compile_model",
|
||||
action="store_true",
|
||||
help="Set policy.config.compile_model=True (torch.compile the forward).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--compile-mode",
|
||||
default="default",
|
||||
help="torch.compile mode (default | reduce-overhead | max-autotune).",
|
||||
)
|
||||
parser.add_argument("--batch-size", type=int, default=8)
|
||||
parser.add_argument("--warmup", type=int, default=8)
|
||||
parser.add_argument("--steps", type=int, default=40)
|
||||
parser.add_argument("--lang-tokens", type=int, default=512)
|
||||
parser.add_argument("--dtype", choices=["bfloat16", "float32"], default="bfloat16")
|
||||
parser.add_argument("--action-dim", type=int, default=14)
|
||||
parser.add_argument("--state-dim", type=int, default=14)
|
||||
parser.add_argument("--knowledge-insulation", action="store_true", default=True)
|
||||
parser.add_argument(
|
||||
"--gradient-checkpointing",
|
||||
dest="gradient_checkpointing",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--optimizer",
|
||||
choices=["none", "adamw", "adamw_fused"],
|
||||
default="adamw_fused",
|
||||
help=(
|
||||
"Whether to include an AdamW step in the timed iteration. "
|
||||
"'none' mirrors the fwd+bwd-only original bench; 'adamw' / "
|
||||
"'adamw_fused' add the realistic ~2x param-bytes optimizer "
|
||||
"state and ``optimizer.step()`` cost."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train-full",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=True,
|
||||
help=(
|
||||
"Mirror production: unfreeze the PaliGemma backbone (full "
|
||||
"~3B trainable params) instead of training only the 300M "
|
||||
"action expert."
|
||||
),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
raise SystemExit("Benchmark requires CUDA; submit via slurm (srun/sbatch).")
|
||||
|
||||
if args.attn == "eager":
|
||||
_maybe_patch_eager()
|
||||
elif args.attn == "flex":
|
||||
_maybe_patch_flex()
|
||||
|
||||
liger_flags = _maybe_patch_liger(args.kernels)
|
||||
|
||||
device = torch.device("cuda")
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
|
||||
policy, config = _build_policy(args, device)
|
||||
batch = _build_batch(args, config, device)
|
||||
|
||||
optimizer = None
|
||||
trainable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)
|
||||
if args.optimizer != "none":
|
||||
trainable = [p for p in policy.parameters() if p.requires_grad]
|
||||
optimizer = torch.optim.AdamW(
|
||||
trainable, lr=5e-5, fused=(args.optimizer == "adamw_fused")
|
||||
)
|
||||
|
||||
for _ in range(args.warmup):
|
||||
_step(policy, batch, optimizer)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
starter = torch.cuda.Event(enable_timing=True)
|
||||
ender = torch.cuda.Event(enable_timing=True)
|
||||
starter.record()
|
||||
for _ in range(args.steps):
|
||||
_step(policy, batch, optimizer)
|
||||
ender.record()
|
||||
torch.cuda.synchronize()
|
||||
total_ms = starter.elapsed_time(ender)
|
||||
step_ms = total_ms / args.steps
|
||||
peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
|
||||
optim_gb = 0.0
|
||||
if optimizer is not None:
|
||||
for st in optimizer.state.values():
|
||||
for v in st.values():
|
||||
if torch.is_tensor(v):
|
||||
optim_gb += v.numel() * v.element_size() / (1024**3)
|
||||
|
||||
liger_on = ",".join(k for k, v in liger_flags.items() if v) or "none"
|
||||
name = (
|
||||
f"{args.attn:>5} | BS={args.batch_size} | L={args.lang_tokens} | "
|
||||
f"KI={args.knowledge_insulation} | GC={args.gradient_checkpointing} | "
|
||||
f"compile={args.compile_model} | liger={liger_on} | opt={args.optimizer} | dtype={args.dtype}"
|
||||
)
|
||||
print(
|
||||
f"{name}\n step_ms={step_ms:.1f} steps/sec={1000.0 / step_ms:.3f} "
|
||||
f"peak_mem={peak_gb:.2f} GiB optim_state={optim_gb:.2f} GiB "
|
||||
f"trainable_params={trainable_params / 1e9:.2f}B"
|
||||
)
|
||||
|
||||
del policy, batch
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-attn
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||
|
||||
python -c "import torch; print('torch', torch.__version__, 'cuda', torch.version.cuda)"
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# Attention parity benchmark — same shapes, different attention kernel.
|
||||
run --attn eager --batch-size 8
|
||||
run --attn sdpa --batch-size 8
|
||||
|
||||
# Headroom benchmark — does SDPA's memory cut allow a bigger micro-batch?
|
||||
run --attn sdpa --batch-size 12
|
||||
run --attn sdpa --batch-size 16
|
||||
run --attn sdpa --batch-size 24
|
||||
@@ -1,39 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-v2
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=00:45:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_v2_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# A: GC ON — see if the selective-AC change (one less recompute level)
|
||||
# narrows the eager vs SDPA gap at BS=8.
|
||||
run --attn eager --batch-size 8
|
||||
run --attn sdpa --batch-size 8
|
||||
|
||||
# B: GC OFF — isolate the raw attention-kernel cost & memory delta.
|
||||
run --attn eager --batch-size 4 --no-gradient-checkpointing
|
||||
run --attn sdpa --batch-size 4 --no-gradient-checkpointing
|
||||
|
||||
# C: SDPA + GC headroom sweep — where does it OOM?
|
||||
run --attn sdpa --batch-size 16
|
||||
run --attn sdpa --batch-size 24
|
||||
run --attn sdpa --batch-size 32
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-v3
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=00:45:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_v3_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# Compile sweep: does torch.compile + SDPA give a non-trivial boost on
|
||||
# top of the bare SDPA path?
|
||||
run --attn sdpa --batch-size 8 --compile
|
||||
run --attn sdpa --batch-size 16 --compile
|
||||
|
||||
# FlexAttention sweep (experimental): score_mod adds the additive bias
|
||||
# in-kernel; expect a long first-step compile, then SDPA-or-better steady
|
||||
# state.
|
||||
run --attn flex --batch-size 8
|
||||
run --attn flex --batch-size 16
|
||||
@@ -1,41 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-v4
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=01:00:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_v4_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
|
||||
# /fsx triton cache is shared across nodes with different glibc versions
|
||||
# — kernels built on one node trip GLIBC_2.34-not-found on another. Use
|
||||
# a node-local cache per job to side-step that.
|
||||
export TRITON_CACHE_DIR="/tmp/triton_${SLURM_JOB_ID}"
|
||||
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${SLURM_JOB_ID}"
|
||||
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||
ldd --version | head -1
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# compile path on top of SDPA + selective AC
|
||||
run --attn sdpa --batch-size 8 --compile
|
||||
run --attn sdpa --batch-size 16 --compile
|
||||
|
||||
# FlexAttention experimental
|
||||
run --attn flex --batch-size 8
|
||||
run --attn flex --batch-size 16
|
||||
@@ -1,33 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-v5
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=00:45:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_v5_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
export TRITON_CACHE_DIR="/tmp/triton_${SLURM_JOB_ID}"
|
||||
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${SLURM_JOB_ID}"
|
||||
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# compile_mode=default (graph-only, no autotune) is the right knob with
|
||||
# gradient checkpointing — max-autotune in v4 was 2x slower than no-compile.
|
||||
run --attn sdpa --batch-size 8 --compile --compile-mode default
|
||||
run --attn sdpa --batch-size 16 --compile --compile-mode default
|
||||
run --attn sdpa --batch-size 8 --compile --compile-mode reduce-overhead
|
||||
@@ -1,31 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-v6-bs32
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_v6_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
export TRITON_CACHE_DIR="/tmp/triton_${SLURM_JOB_ID}"
|
||||
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${SLURM_JOB_ID}"
|
||||
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# BS=32 with the production settings (SDPA + compile=default).
|
||||
run --attn sdpa --batch-size 32 --compile --compile-mode default
|
||||
@@ -1,39 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-v7-opt
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=00:45:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_v7_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
export TRITON_CACHE_DIR="/tmp/triton_${SLURM_JOB_ID}"
|
||||
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${SLURM_JOB_ID}"
|
||||
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# Realistic full-step memory: fwd + bwd + AdamW step. The original
|
||||
# sweep was fwd+bwd-only and undercounted memory by the optimizer-
|
||||
# state size (~2x param bytes for AdamW). This run confirms BS=16
|
||||
# and BS=32 still fit with the optimizer in residency.
|
||||
run --attn sdpa --batch-size 16 --compile --compile-mode default --optimizer adamw_fused
|
||||
run --attn sdpa --batch-size 32 --compile --compile-mode default --optimizer adamw_fused
|
||||
|
||||
# Without compile, in case the production cluster has compile issues.
|
||||
run --attn sdpa --batch-size 16 --optimizer adamw_fused
|
||||
run --attn sdpa --batch-size 32 --optimizer adamw_fused
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=bench-pi052-v8-bs40-dtype
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=00:45:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --output=/fsx/pepijn/logs/bench_pi052_v8_%j.out
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
export TRITON_CACHE_DIR="/tmp/triton_${SLURM_JOB_ID}"
|
||||
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${SLURM_JOB_ID}"
|
||||
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"
|
||||
|
||||
echo "=== Node: $(hostname) ==="
|
||||
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
|
||||
|
||||
run() {
|
||||
echo
|
||||
echo "--- $* ---"
|
||||
python examples/benchmark/bench_pi052_step.py "$@" || true
|
||||
}
|
||||
|
||||
# Confirm BS=40 fits on a single H100 with the optimizer in residency.
|
||||
run --attn sdpa --batch-size 40 --compile --compile-mode default --optimizer adamw_fused
|
||||
|
||||
# Dtype A/B at modest batch — fp32 needs ~2x the memory of bf16, so we
|
||||
# drop to BS=4 to keep both runs comparable instead of OOMing fp32.
|
||||
run --attn sdpa --batch-size 4 --optimizer adamw_fused --dtype bfloat16
|
||||
run --attn sdpa --batch-size 4 --optimizer adamw_fused --dtype float32
|
||||
@@ -1,29 +0,0 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
enable_cpu_affinity: false
|
||||
fsdp_config:
|
||||
fsdp_activation_checkpointing: false
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch: BACKWARD_PRE
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_forward_prefetch: false
|
||||
fsdp_offload_params: false
|
||||
fsdp_reshard_after_forward: true
|
||||
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer,SiglipEncoderLayer
|
||||
fsdp_use_orig_params: true
|
||||
fsdp_version: 2
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 8
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,162 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Emit the ``--dataset.episodes`` include-list for a LeRobot dataset, minus a
|
||||
set of excluded episode indices.
|
||||
|
||||
``LeRobotDatasetConfig.episodes`` is an *include* list (train only on the listed
|
||||
episodes), so "exclude episode X" means "pass every episode except X". This
|
||||
helper builds that complement.
|
||||
|
||||
For ``pepijn223/robocasa_pretrain_human300_v4`` the default exclusion set is the
|
||||
63 episodes that carry NO ``subtask`` annotation (in fact no persistent language
|
||||
rows at all) — see the scan in this PR's discussion. Training the steerable
|
||||
SmolVLA/pi052 policy on those episodes would feed it frames with empty subtask
|
||||
targets, so we drop them.
|
||||
|
||||
Usage (prints a compact ``[0,1,2,...]`` list to stdout, logs to stderr):
|
||||
|
||||
python scripts/build_episode_filter.py \
|
||||
--repo-id pepijn223/robocasa_pretrain_human300_v4
|
||||
|
||||
# capture in a shell script
|
||||
EPISODES=$(python scripts/build_episode_filter.py --repo-id <id>)
|
||||
lerobot-train ... --dataset.episodes="$EPISODES"
|
||||
|
||||
The helper reads ``meta/info.json`` from the Hub to learn ``total_episodes`` and
|
||||
validates that every excluded index is in ``[0, total_episodes)`` before emitting
|
||||
the complement. Pass ``--no-validate-hub`` to skip the network round-trip and use
|
||||
``--total-episodes`` directly (e.g. for an offline / local dataset).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Episodes in pepijn223/robocasa_pretrain_human300_v4 with no `subtask`
|
||||
# annotation (no persistent language rows at all). 63 episodes / 179,009 frames.
|
||||
DEFAULT_EXCLUDE: tuple[int, ...] = (
|
||||
1065, 2972, 6971, 8129, 9167, 9170, 9171, 9177, 9190, 9196, 9199, 9204,
|
||||
9207, 9208, 9210, 9217, 9232, 9234, 9240, 9243, 9254, 9256, 9258, 9259,
|
||||
9261, 9263, 9264, 15928, 16350, 18729, 20026, 21703, 25314, 25319, 25321,
|
||||
25324, 25333, 25340, 25356, 25366, 25374, 25388, 25392, 25825, 25893,
|
||||
26347, 26357, 26374, 26375, 26388, 26394, 26398, 26400, 26409, 26422,
|
||||
26423, 26426, 26895, 26905, 26915, 26954, 27064, 30812,
|
||||
)
|
||||
|
||||
|
||||
def _log(msg: str) -> None:
|
||||
print(msg, file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _total_episodes_from_hub(repo_id: str, revision: str | None) -> int:
|
||||
"""Return ``total_episodes`` from the dataset's ``meta/info.json`` on the Hub."""
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
path = hf_hub_download(
|
||||
repo_id=repo_id,
|
||||
filename="meta/info.json",
|
||||
repo_type="dataset",
|
||||
revision=revision,
|
||||
)
|
||||
with open(path) as f:
|
||||
info = json.load(f)
|
||||
total = int(info["total_episodes"])
|
||||
if total <= 0:
|
||||
raise ValueError(f"info.json reports non-positive total_episodes={total!r}")
|
||||
return total
|
||||
|
||||
|
||||
def build_include_list(total_episodes: int, exclude: set[int]) -> list[int]:
|
||||
"""Return ``[0, total_episodes)`` with ``exclude`` removed, ascending."""
|
||||
out_of_range = sorted(e for e in exclude if e < 0 or e >= total_episodes)
|
||||
if out_of_range:
|
||||
raise ValueError(
|
||||
f"{len(out_of_range)} excluded index(es) outside [0, {total_episodes}): "
|
||||
f"{out_of_range[:10]}{'...' if len(out_of_range) > 10 else ''}. "
|
||||
"The dataset may have changed — re-run the subtask scan before training."
|
||||
)
|
||||
return [e for e in range(total_episodes) if e not in exclude]
|
||||
|
||||
|
||||
def main() -> int:
|
||||
p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
|
||||
p.add_argument("--repo-id", default="pepijn223/robocasa_pretrain_human300_v4")
|
||||
p.add_argument("--revision", default=None, help="Dataset revision/branch (default: main).")
|
||||
p.add_argument(
|
||||
"--exclude-file",
|
||||
default=None,
|
||||
help="Optional JSON file with a list of episode indices to exclude. "
|
||||
"Overrides the built-in default set.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--total-episodes",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Total episode count. If omitted, read from meta/info.json on the Hub.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--no-validate-hub",
|
||||
action="store_true",
|
||||
help="Do not fetch info.json from the Hub; requires --total-episodes.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--out",
|
||||
default=None,
|
||||
help="Write the list to this file instead of stdout.",
|
||||
)
|
||||
args = p.parse_args()
|
||||
|
||||
if args.exclude_file:
|
||||
with open(args.exclude_file) as f:
|
||||
data = json.load(f)
|
||||
# Accept either a bare list or the {"missing_episode_indices": [...]} report shape.
|
||||
exclude = set(data["missing_episode_indices"] if isinstance(data, dict) else data)
|
||||
else:
|
||||
exclude = set(DEFAULT_EXCLUDE)
|
||||
|
||||
if args.total_episodes is not None:
|
||||
total = args.total_episodes
|
||||
if not args.no_validate_hub:
|
||||
hub_total = _total_episodes_from_hub(args.repo_id, args.revision)
|
||||
if hub_total != total:
|
||||
raise ValueError(
|
||||
f"--total-episodes={total} disagrees with Hub info.json total_episodes={hub_total}."
|
||||
)
|
||||
else:
|
||||
if args.no_validate_hub:
|
||||
raise SystemExit("--no-validate-hub requires --total-episodes.")
|
||||
total = _total_episodes_from_hub(args.repo_id, args.revision)
|
||||
|
||||
include = build_include_list(total, exclude)
|
||||
_log(
|
||||
f"[build_episode_filter] repo={args.repo_id} total={total} "
|
||||
f"excluded={len(exclude)} kept={len(include)}"
|
||||
)
|
||||
|
||||
# Compact JSON (no spaces) so the resulting CLI arg stays as short as possible.
|
||||
payload = "[" + ",".join(map(str, include)) + "]"
|
||||
if args.out:
|
||||
with open(args.out, "w") as f:
|
||||
f.write(payload)
|
||||
_log(f"[build_episode_filter] wrote {len(payload)} bytes to {args.out}")
|
||||
else:
|
||||
sys.stdout.write(payload)
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,47 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Build a tiny RoboCasa smoke dataset (2 short atomic tasks, all episodes) for
|
||||
# fast end-to-end training validation before the real run.
|
||||
#
|
||||
# Defaults: target/human, OpenStandMixerHead + NavigateKitchen (~1k episodes,
|
||||
# ~131k frames, ~109 min @ 20 fps), 2 SLURM workers on hopper-cpu.
|
||||
#
|
||||
# Override via env: TASKS, REPO_ID, WORK_DIR, WORKERS, CPUS, PARTITION, LOCAL=1.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
source ~/miniconda3/etc/profile.d/conda.sh
|
||||
conda activate lerobot
|
||||
|
||||
REPO_ID="${REPO_ID:-${HF_USER:?HF_USER is unset}/robocasa_smoke_2atomic_v3}"
|
||||
WORK_DIR="${WORK_DIR:-/fsx/${USER}/robocasa/datasets/v1.0}"
|
||||
ROBOCASA_ROOT="${ROBOCASA_ROOT:-/fsx/${USER}/robocasa}"
|
||||
LOGS_DIR="${LOGS_DIR:-/fsx/${USER}/logs/robocasa}"
|
||||
TASKS="${TASKS:-OpenStandMixerHead NavigateKitchen}"
|
||||
WORKERS="${WORKERS:-2}"
|
||||
CPUS="${CPUS:-8}"
|
||||
PARTITION="${PARTITION:-hopper-cpu}"
|
||||
LOCAL="${LOCAL:-0}"
|
||||
|
||||
ARGS=(
|
||||
examples/port_datasets/slurm_build_robocasa_composite_seen.py
|
||||
--repo-id="$REPO_ID"
|
||||
--work-dir="$WORK_DIR"
|
||||
--robocasa-root="$ROBOCASA_ROOT"
|
||||
--split=target --source=human
|
||||
--tasks $TASKS
|
||||
--workers="$WORKERS"
|
||||
--cpus-per-task="$CPUS"
|
||||
--partition="$PARTITION"
|
||||
--mem-per-cpu=4G
|
||||
--time=04:00:00
|
||||
--logs-dir="$LOGS_DIR"
|
||||
--job-name=port_robocasa_smoke
|
||||
)
|
||||
if [[ "$LOCAL" == "1" ]]; then
|
||||
ARGS+=(--slurm=0)
|
||||
fi
|
||||
|
||||
echo "Smoke dataset: $REPO_ID"
|
||||
echo "Tasks: $TASKS"
|
||||
python "${ARGS[@]}"
|
||||
@@ -1,115 +0,0 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=pi052-hirobot-robocasa-human300
|
||||
#SBATCH --partition=hopper-prod
|
||||
#SBATCH --qos=high
|
||||
#SBATCH --time=48:00:00
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --gpus-per-task=8
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "${LEROBOT_ROOT:-$HOME/lerobot}"
|
||||
|
||||
export LEROBOT_DEBUG_PREDS_EVERY=1000
|
||||
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"
|
||||
export NCCL_TIMEOUT="${NCCL_TIMEOUT:-1800}"
|
||||
export HF_HUB_DOWNLOAD_TIMEOUT="${HF_HUB_DOWNLOAD_TIMEOUT:-120}"
|
||||
export WANDB_INIT_TIMEOUT="${WANDB_INIT_TIMEOUT:-300}"
|
||||
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
|
||||
# Compile path: pin triton + inductor caches node-local. The shared
|
||||
# /fsx cache mixes kernels built against different glibc versions and
|
||||
# trips ``GLIBC_2.34 not found`` on hopper nodes (bench v3 confirmed).
|
||||
export TRITON_CACHE_DIR="/tmp/triton_${SLURM_JOB_ID}"
|
||||
export TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${SLURM_JOB_ID}"
|
||||
mkdir -p "$TRITON_CACHE_DIR" "$TORCHINDUCTOR_CACHE_DIR"
|
||||
|
||||
# Non-fatal so an unstaged local hotfix doesn't kill the job. CI / clean
|
||||
# checkouts still fast-forward as before; dirty trees just keep their
|
||||
# in-flight changes (the working tree is what runs).
|
||||
git pull --ff-only || echo "[warn] git pull skipped — keeping working tree."
|
||||
python -m pip install -q --upgrade -e .
|
||||
python -m pip install -q --upgrade -e '.[pi]'
|
||||
python -m pip install -q --upgrade 'liger-kernel'
|
||||
|
||||
# FlashAttention-2 is NOT installed. The pi052 dual-expert layer compute
|
||||
# uses SDPA (the block-bidirectional mask is unsupported by FA2 anyway),
|
||||
# and the only other consumer would be liger-kernel — which gracefully
|
||||
# degrades when flash_attn is absent. The previously-installed wheel was
|
||||
# built against a newer GLIBC than some hopper compute nodes provide
|
||||
# (job 22162586 on ip-26-0-162-14 hit ``GLIBC_2.32 not found``), so the
|
||||
# safest configuration is "not installed". To re-enable for the
|
||||
# downstream HF Gemma ``generate`` path, install a wheel matching the
|
||||
# node's libc — but verify on every assigned node first.
|
||||
|
||||
DATASET="pepijn223/robocasa_pretrain_human300_v4"
|
||||
DATASET_REVISION="${DATASET_REVISION:-main}"
|
||||
POLICY_REPO_ID="pepijn223/pi052_robocasa_human300"
|
||||
JOB_NAME="pi052-hirobot-robocasa-human300"
|
||||
NUM_PROCESSES=8
|
||||
# BS=36 — fits ~72 GB / 80 GB, BS=36 × 8 GPUs = 288 effective.
|
||||
BATCH_SIZE=${BATCH_SIZE:-36}
|
||||
STEPS=${STEPS:-5000}
|
||||
RUN_ID="${SLURM_JOB_ID:-$(date +%Y%m%d_%H%M%S)}"
|
||||
OUTPUT_DIR="/fsx/pepijn/outputs/train/pi052_robocasa_human300_${RUN_ID}"
|
||||
|
||||
# --- Exclude un-annotated episodes -----------------------------------------
|
||||
# 63 episodes in this dataset carry NO `subtask` annotation (no persistent
|
||||
# language rows at all). `--dataset.episodes` is an INCLUDE list, so we pass
|
||||
# the complement: every episode index except those 63. The helper reads
|
||||
# meta/info.json from the Hub to confirm total_episodes (32043) and validates
|
||||
# the excluded indices are in range before emitting the list. If the dataset
|
||||
# version changes such that the indices fall out of range, the helper aborts
|
||||
# the job rather than silently training on the wrong episodes.
|
||||
echo "Building episode include-list (excluding un-annotated episodes)..."
|
||||
EPISODES=$(python scripts/build_episode_filter.py \
|
||||
--repo-id "$DATASET" \
|
||||
--revision "$DATASET_REVISION")
|
||||
|
||||
echo "Training pi052 on $DATASET with ${NUM_PROCESSES} GPUs, batch size ${BATCH_SIZE}/GPU, ${STEPS} steps"
|
||||
echo "Output directory: $OUTPUT_DIR"
|
||||
export LEROBOT_DUMP_RECIPE_SAMPLES=8
|
||||
|
||||
accelerate launch --multi_gpu --num_processes="$NUM_PROCESSES" \
|
||||
-m lerobot.scripts.lerobot_train \
|
||||
--policy.type=pi052 \
|
||||
--policy.pretrained_path=lerobot/pi05_base \
|
||||
--policy.recipe_path=recipes/subtask_mem_vqa_robocasa.yaml \
|
||||
--dataset.repo_id="$DATASET" \
|
||||
--dataset.revision="$DATASET_REVISION" \
|
||||
--dataset.episodes="$EPISODES" \
|
||||
--dataset.video_backend=pyav \
|
||||
--output_dir="$OUTPUT_DIR" \
|
||||
--job_name="$JOB_NAME" \
|
||||
--policy.repo_id="$POLICY_REPO_ID" \
|
||||
--policy.compile_model=true \
|
||||
--policy.compile_mode=default \
|
||||
--policy.gradient_checkpointing=true \
|
||||
--policy.device=cuda \
|
||||
--policy.tokenizer_max_length=256 \
|
||||
--policy.action_tokenizer_name=lerobot/fast-action-tokenizer \
|
||||
--policy.chunk_size=30 \
|
||||
--policy.n_action_steps=30 \
|
||||
--policy.max_action_tokens=256 \
|
||||
--steps="$STEPS" \
|
||||
--policy.scheduler_decay_steps="$STEPS" \
|
||||
--batch_size="$BATCH_SIZE" \
|
||||
--wandb.enable=true \
|
||||
--policy.dtype=bfloat16 \
|
||||
--policy.optimizer_lr=5e-5 \
|
||||
--policy.optimizer_grad_clip_norm=1.0 \
|
||||
--policy.scheduler_decay_lr=5e-6 \
|
||||
--policy.lm_head_lr_scale=5.0 \
|
||||
--ema.enable=true \
|
||||
--wandb.disable_artifact=true \
|
||||
--wandb.project=hirobot \
|
||||
--log_freq=100 \
|
||||
--save_freq=5000 \
|
||||
--num_workers=4 \
|
||||
--prefetch_factor=4 \
|
||||
--persistent_workers=true \
|
||||
--dataset.image_transforms.enable=true \
|
||||
--dataset.image_transforms.max_num_transforms=3 \
|
||||
--dataset.image_transforms.random_order=true \
|
||||
--policy.auto_fit_fast_tokenizer=true \
|
||||
--policy.knowledge_insulation=true
|
||||
@@ -1,345 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build a single combined LeRobotDataset from RoboCasa's 16 composite_seen tasks.
|
||||
|
||||
RoboCasa 1.0 already ships in LeRobot format (parquet + mp4), distributed as
|
||||
``lerobot.tar`` archives from Box. This script:
|
||||
|
||||
1. Downloads each composite_seen task's ``target/human`` archive via RoboCasa's
|
||||
official ``download_datasets`` helper (idempotent — skipped if already on
|
||||
disk).
|
||||
2. Opens each extracted directory as a ``LeRobotDataset``.
|
||||
3. Merges all 16 into one unified dataset via ``merge_datasets`` (a thin wrapper
|
||||
over ``aggregate_datasets`` that revalidates fps / robot_type / features,
|
||||
unifies task indices, concatenates videos and parquet, and recomputes stats).
|
||||
4. Optionally pushes the merged dataset to the Hub.
|
||||
|
||||
The result is one ~8,000-trajectory dataset where each episode carries its
|
||||
source task as the ``task`` field — ready for downstream annotation
|
||||
(subtasks / memory / VQA / tool calls) without per-task bookkeeping.
|
||||
|
||||
Usage::
|
||||
|
||||
uv run python -m lerobot.scripts.build_robocasa_composite_seen \\
|
||||
--output-dir=/data/lerobot/robocasa_composite_seen \\
|
||||
--hub-repo-id=${HF_USER}/robocasa_composite_seen \\
|
||||
--push-to-hub
|
||||
|
||||
Prereqs: ``robocasa`` and ``robosuite`` installed (see
|
||||
``docs/source/benchmarks/robocasa.mdx`` for the editable-install dance — they
|
||||
are not on PyPI and RoboCasa's own ``setup.py`` pins an old LeRobot version).
|
||||
|
||||
The 16 composite_seen tasks are the multi-step subset of the official
|
||||
RoboCasa365 target benchmark — exactly the slice used to compute the
|
||||
``Composite-Seen`` column of the leaderboard.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from lerobot.datasets.dataset_tools import merge_datasets
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Canonical 16 composite_seen tasks (RoboCasa365 target benchmark).
|
||||
# Order matches the leaderboard docs.
|
||||
COMPOSITE_SEEN_TASKS: list[str] = [
|
||||
"DeliverStraw",
|
||||
"GetToastedBread",
|
||||
"KettleBoiling",
|
||||
"LoadDishwasher",
|
||||
"PackIdenticalLunches",
|
||||
"PreSoakPan",
|
||||
"PrepareCoffee",
|
||||
"RinseSinkBasin",
|
||||
"ScrubCuttingBoard",
|
||||
"SearingMeat",
|
||||
"SetUpCuttingStation",
|
||||
"StackBowlsCabinet",
|
||||
"SteamInMicrowave",
|
||||
"StirVegetables",
|
||||
"StoreLeftoversInBowl",
|
||||
"WashLettuce",
|
||||
]
|
||||
|
||||
|
||||
def _require_robocasa() -> None:
|
||||
"""Fail fast with an actionable message if robocasa is missing.
|
||||
|
||||
RoboCasa is not on PyPI and is not a LeRobot extra — see the installation
|
||||
notes in ``docs/source/benchmarks/robocasa.mdx``.
|
||||
"""
|
||||
try:
|
||||
import robocasa # noqa: F401, PLC0415
|
||||
from robocasa.scripts import download_datasets as _dl # noqa: F401, PLC0415
|
||||
from robocasa.utils import dataset_registry as _reg # noqa: F401, PLC0415
|
||||
except ImportError as exc:
|
||||
sys.exit(
|
||||
"[build_robocasa_composite_seen] robocasa is not importable.\n"
|
||||
"Install it (and robosuite) per the LeRobot RoboCasa docs:\n"
|
||||
" git clone https://github.com/robocasa/robocasa.git ~/robocasa\n"
|
||||
" git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite\n"
|
||||
" pip install -e ~/robocasa --no-deps\n"
|
||||
" pip install -e ~/robosuite\n"
|
||||
f"(original error: {exc})"
|
||||
)
|
||||
|
||||
|
||||
def _resolve_task_root(task: str) -> Path:
|
||||
"""Resolve the local extracted ``LeRobotDataset`` root for a target/human task.
|
||||
|
||||
Uses RoboCasa's own ``dataset_registry`` so we follow whatever directory
|
||||
layout RoboCasa picks (currently ``v1.0/target/composite/<task>/<date>/``
|
||||
under ``robocasa.macros.DATASET_BASE_DIR``). Falls back to discovering the
|
||||
extracted directory if the helper's signature drifted between releases.
|
||||
"""
|
||||
from robocasa.utils import dataset_registry # noqa: PLC0415
|
||||
|
||||
# ``get_ds_path`` is the canonical helper. RoboCasa 1.0 signature is
|
||||
# ``get_ds_path(task, ds_type, return_info=False)`` with ``ds_type`` like
|
||||
# ``"human_im"`` (image-observation human demos). We try the common
|
||||
# ``split=`` kwarg first (newer registry); if it's rejected, fall back.
|
||||
try:
|
||||
ds_path = dataset_registry.get_ds_path(
|
||||
task=task,
|
||||
ds_type="human_im",
|
||||
return_info=False,
|
||||
split="target",
|
||||
)
|
||||
except TypeError:
|
||||
# Older registry — ds_type alone disambiguates target/human.
|
||||
ds_path = dataset_registry.get_ds_path(
|
||||
task=task,
|
||||
ds_type="human_im",
|
||||
return_info=False,
|
||||
)
|
||||
|
||||
root = Path(ds_path)
|
||||
# ``get_ds_path`` may return either the extracted dir or the .tar; normalize.
|
||||
if root.suffix == ".tar":
|
||||
root = root.parent
|
||||
return root
|
||||
|
||||
|
||||
def _download_task(task: str, *, overwrite: bool = False) -> Path:
|
||||
"""Download (or locate) a single target/human task and return its extracted root."""
|
||||
from robocasa.scripts import download_datasets as dl # noqa: PLC0415
|
||||
|
||||
# Try the documented programmatic API. The CLI is
|
||||
# python -m robocasa.scripts.download_datasets --tasks <T> --source human --split target
|
||||
# which is a thin wrapper over a function of the same name.
|
||||
if hasattr(dl, "download_datasets"):
|
||||
try:
|
||||
dl.download_datasets(
|
||||
tasks=[task],
|
||||
source="human",
|
||||
split="target",
|
||||
overwrite=overwrite,
|
||||
)
|
||||
except TypeError:
|
||||
# Older signature — drop the kwargs RoboCasa didn't have yet.
|
||||
dl.download_datasets(tasks=[task])
|
||||
else:
|
||||
# No public function — shell out to the CLI as a last resort. This
|
||||
# guarantees we use whatever entrypoint RoboCasa's authors maintain.
|
||||
import subprocess # noqa: PLC0415
|
||||
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"-m",
|
||||
"robocasa.scripts.download_datasets",
|
||||
"--tasks",
|
||||
task,
|
||||
"--source",
|
||||
"human",
|
||||
"--split",
|
||||
"target",
|
||||
]
|
||||
if overwrite:
|
||||
cmd.append("--overwrite")
|
||||
subprocess.run(cmd, check=True)
|
||||
|
||||
root = _resolve_task_root(task)
|
||||
if not root.exists():
|
||||
raise RuntimeError(
|
||||
f"Expected {root} after download, but it doesn't exist. "
|
||||
"RoboCasa may have changed its data layout — verify with "
|
||||
"`robocasa.utils.dataset_registry.get_ds_path()`."
|
||||
)
|
||||
return root
|
||||
|
||||
|
||||
def _open_as_lerobot_dataset(task: str, root: Path) -> LeRobotDataset:
|
||||
"""Open an extracted RoboCasa target/human task as a ``LeRobotDataset``.
|
||||
|
||||
The placeholder ``repo_id`` (``robocasa/<task>_target_human``) is only used
|
||||
by the aggregator for logging and for the unified task table — the actual
|
||||
data is loaded from ``root``.
|
||||
"""
|
||||
repo_id = f"robocasa/{task}_target_human"
|
||||
return LeRobotDataset(repo_id=repo_id, root=root)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Aggregate the 16 RoboCasa composite_seen target tasks into one LeRobotDataset.",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog=__doc__,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Local directory for the merged dataset (will be created).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hub-repo-id",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Hub repo_id for the merged dataset (e.g. ``yourname/"
|
||||
"robocasa_composite_seen``). Required for ``--push-to-hub``; also "
|
||||
"becomes the merged dataset's canonical ``repo_id``."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push-to-hub",
|
||||
action="store_true",
|
||||
help="Push the merged dataset to the Hub after building. Requires "
|
||||
"``--hub-repo-id`` and a prior ``huggingface-cli login``.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--private",
|
||||
action="store_true",
|
||||
help="When pushing, create the Hub repo as private.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tasks",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Comma-separated task names to override the default 16 "
|
||||
"composite_seen list (useful for smoke-testing with 1–2 tasks).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-download",
|
||||
action="store_true",
|
||||
help="Skip the download step entirely; assume each task is already "
|
||||
"extracted on disk at the path ``dataset_registry.get_ds_path`` "
|
||||
"returns.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite-download",
|
||||
action="store_true",
|
||||
help="Force re-download even when a complete local extraction exists.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-level",
|
||||
type=str,
|
||||
default="INFO",
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, args.log_level),
|
||||
format="[%(levelname)s] %(message)s",
|
||||
)
|
||||
|
||||
tasks = (
|
||||
[t.strip() for t in args.tasks.split(",") if t.strip()]
|
||||
if args.tasks
|
||||
else list(COMPOSITE_SEEN_TASKS)
|
||||
)
|
||||
if not tasks:
|
||||
sys.exit("No tasks selected.")
|
||||
|
||||
if args.push_to_hub and not args.hub_repo_id:
|
||||
sys.exit("--push-to-hub requires --hub-repo-id.")
|
||||
|
||||
output_repo_id = args.hub_repo_id or "local/robocasa_composite_seen"
|
||||
logger.info(
|
||||
"Building merged RoboCasa dataset: %d tasks → %s (output dir: %s)",
|
||||
len(tasks),
|
||||
output_repo_id,
|
||||
args.output_dir,
|
||||
)
|
||||
|
||||
_require_robocasa()
|
||||
|
||||
# 1. Download (or locate) each task's extracted directory.
|
||||
task_roots: list[tuple[str, Path]] = []
|
||||
for i, task in enumerate(tasks, 1):
|
||||
logger.info("[%d/%d] %s", i, len(tasks), task)
|
||||
if args.skip_download:
|
||||
root = _resolve_task_root(task)
|
||||
if not root.exists():
|
||||
sys.exit(
|
||||
f"--skip-download set but extracted directory does not "
|
||||
f"exist for {task}: {root}"
|
||||
)
|
||||
else:
|
||||
root = _download_task(task, overwrite=args.overwrite_download)
|
||||
logger.info(" extracted at: %s", root)
|
||||
task_roots.append((task, root))
|
||||
|
||||
# 2. Open each as a LeRobotDataset (validation happens inside aggregator).
|
||||
datasets: list[LeRobotDataset] = []
|
||||
for task, root in task_roots:
|
||||
logger.info("Opening %s", task)
|
||||
ds = _open_as_lerobot_dataset(task, root)
|
||||
logger.info(
|
||||
" %s: %d episodes, %d frames, %d FPS",
|
||||
task,
|
||||
ds.num_episodes,
|
||||
ds.num_frames,
|
||||
ds.fps,
|
||||
)
|
||||
datasets.append(ds)
|
||||
|
||||
# 3. Merge — re-validates features/fps/robot_type, unifies tasks, concats
|
||||
# videos + parquet, recomputes stats.
|
||||
logger.info("Merging %d datasets into %s", len(datasets), output_repo_id)
|
||||
merged = merge_datasets(
|
||||
datasets=datasets,
|
||||
output_repo_id=output_repo_id,
|
||||
output_dir=args.output_dir,
|
||||
)
|
||||
logger.info(
|
||||
"Merged: %d episodes, %d frames across %d unique task strings",
|
||||
merged.num_episodes,
|
||||
merged.num_frames,
|
||||
len(merged.meta.tasks) if merged.meta.tasks is not None else 0,
|
||||
)
|
||||
|
||||
# 4. Push to Hub.
|
||||
if args.push_to_hub:
|
||||
logger.info("Pushing %s to the Hub (private=%s)", args.hub_repo_id, args.private)
|
||||
# ``upload_large_folder=True`` is the right mode for tens-of-GB
|
||||
# datasets — uses multipart uploads + resumable transfers.
|
||||
merged.push_to_hub(
|
||||
private=args.private,
|
||||
upload_large_folder=True,
|
||||
tags=["lerobot", "robocasa", "composite_seen", "manipulation"],
|
||||
)
|
||||
logger.info(
|
||||
"Push complete: https://huggingface.co/datasets/%s",
|
||||
args.hub_repo_id,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"Skipping Hub push (no --push-to-hub). Merged dataset is at %s.",
|
||||
args.output_dir,
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
Reference in New Issue
Block a user