Compare commits

..

2 Commits

Author SHA1 Message Date
Maxime Ellerbach a0b224e48d adding lerobot-train requirement inside PR checklist 2026-07-01 14:42:13 +00:00
Maxime Ellerbach 8ea0c4c9cf chore(agents): adding additional infos to AGENTS.md 2026-07-01 14:40:56 +00:00
9 changed files with 24 additions and 87 deletions
+2 -1
View File
@@ -51,6 +51,7 @@ pre-commit run --all-files # Lint + format (ruff, typo
## Notes
- **Mypy is gradual**: strict only for `lerobot.envs`, `lerobot.configs`, `lerobot.optim`, `lerobot.model`, `lerobot.cameras`, `lerobot.motors`, `lerobot.transport`. Add type annotations when modifying these modules.
- **Optional dependencies**: many policies, envs, and robots are behind extras (e.g., `lerobot[aloha]`). New imports for optional packages must be guarded or lazy. See `pyproject.toml [project.optional-dependencies]`.
- **Imports**: prefer top-level imports; relative (`from .sibling import X`) across sibling files within a module, absolute (`from lerobot.module import X`) across modules.
- **Optional dependencies**: many policies, envs, and robots are behind extras (e.g., `lerobot[aloha]`, see `pyproject.toml`). Guard optional imports with `TYPE_CHECKING or _foo_available` at module top + a `require_package(...)` check at use time. Reuse the `_foo_available` flags in `utils/import_utils.py`; don't call `is_package_available`.
- **Video decoding**: datasets can store observations as video files. `LeRobotDataset` handles frame extraction, but tests need ffmpeg installed.
- **Prioritize use of `uv run`** to execute Python commands (not raw `python` or `pip`).
+3
View File
@@ -165,6 +165,8 @@ Batches are flat dictionaries keyed by the constants in [`lerobot.utils.constant
LeRobot uses `PolicyProcessorPipeline`s to normalize inputs and de-normalize outputs around your policy. For a concrete reference, see [`processor_act.py`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/act/processor_act.py) or [`processor_diffusion.py`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/diffusion/processor_diffusion.py).
Pay close attention here: processors are the most common reproducibility pain point. A mismatch in normalization mode (`IDENTITY` vs `MEAN_STD` vs `MIN_MAX` vs `QUANTILES`/`QUANTILE10`) or in which features get normalized will train and eval without erroring, yet silently wreck results. Make sure the modes match how the checkpoint was trained, that the required stats exist (e.g. `QUANTILES` needs `q01`/`q99`), and that the pre- and post-processors stay consistent.
```python
# processor_my_policy.py
from typing import Any
@@ -371,6 +373,7 @@ The general expectations are in [`CONTRIBUTING.md`](https://github.com/huggingfa
- [ ] Optional deps live behind a `[project.optional-dependencies]` extra and the `TYPE_CHECKING + require_package` guard.
- [ ] `tests/policies/` updated: backward-compat artifact committed and policy-specific tests added.
- [ ] `src/lerobot/policies/<name>/README.md` symlinked into `docs/source/policy_<name>_README.md`; user-facing `docs/source/<name>.mdx` written and added to `_toctree.yml`.
- [ ] `lerobot-train --policy.type my_policy ...` runs end-to-end for at least a few steps and saves a checkpoint that can be loaded and run by `lerobot-eval` or `lerobot-rollout`.
- [ ] At least one reproducible benchmark eval in the policy MDX with a published checkpoint (sim benchmark, or real-robot dataset + checkpoint).
The fastest way to get a clean PR is to copy the directory of the existing policy closest to yours, rename, and replace contents method by method. Don't wait until everything is polished — open a draft PR early and iterate with us; reviewers would much rather give feedback on a half-finished branch than a fully-merged one.
@@ -134,9 +134,6 @@ lerobot-train \
> [!TIP]
> This is purely a decode-time presentation choice — it does **not** alter the stored video or its metadata, so the same dataset can be read as `mm` or `m` without re-encoding. It has no effect on datasets without depth cameras.
> [!IMPORTANT]
> Depth statistics in `meta/stats.json` are always computed in **millimetres**, regardless of the raw frame dtype.
---
## Persistence in dataset metadata
+5 -11
View File
@@ -22,7 +22,6 @@ import numpy as np
from lerobot.processor import RelativeActionsProcessorStep
from lerobot.utils.constants import ACTION, OBS_STATE
from .depth_utils import MM_PER_METRE
from .io_utils import load_image_as_numpy
DEFAULT_QUANTILES = [0.01, 0.10, 0.50, 0.90, 0.99]
@@ -509,8 +508,8 @@ def compute_episode_stats(
Note:
For 'image'/'video' features, stats are computed per channel and kept with a
leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by
255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) are
instead canonicalized to millimetres regardless of the raw frame unit.
255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip
this rescaling and remain in their stored units.
"""
if quantile_list is None:
quantile_list = DEFAULT_QUANTILES
@@ -534,14 +533,9 @@ def compute_episode_stats(
)
if features[key]["dtype"] in ["image", "video"]:
if (features[key].get("info") or {}).get("is_depth_map", False):
# Depth stats are canonically stored in millimetres; metre (float) depth is
# scaled up, integer (millimetre) depth is left as-is.
normalization_factor = (
1.0 / MM_PER_METRE if np.issubdtype(ep_ft_array.dtype, np.floating) else 1.0
)
else:
normalization_factor = 255.0
normalization_factor = (
255.0 if not (features[key].get("info") or {}).get("is_depth_map", False) else 1.0
)
ep_stats[key] = {
k: v if k == "count" else np.squeeze(v / normalization_factor, axis=0)
for k, v in ep_stats[key].items()
+6 -6
View File
@@ -39,7 +39,7 @@ from lerobot.configs.video import (
from .image_writer import squeeze_single_channel
from .pyav_utils import write_u16_plane
MM_PER_METRE = 1000.0
_MM_PER_METRE = 1000.0
_UINT16_MAX = 65535
@@ -126,12 +126,12 @@ def quantize_depth(
# Convert depth_min, depth_max, and shift to the resolved input unit.
depth_min_u = (
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE)
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE)
)
depth_max_u = (
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE)
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE)
)
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE)
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE)
# Normalization and quantization are performed in the resolved input unit.
if use_log:
@@ -236,7 +236,7 @@ def dequantize_depth(
# mm path: round + clamp in float32, skipping the uint16 round-trip
# when returning a tensor (torch.uint16 is poorly supported).
buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
if output_tensor:
return buf
return buf.cpu().numpy().astype(np.uint16, copy=False)
@@ -259,7 +259,7 @@ def dequantize_depth(
if output_unit == DEPTH_METER_UNIT:
return torch.from_numpy(buf) if output_tensor else buf
np.multiply(buf, MM_PER_METRE, out=buf)
np.multiply(buf, _MM_PER_METRE, out=buf)
np.rint(buf, out=buf)
np.clip(buf, 0.0, _UINT16_MAX, out=buf)
if output_tensor:
+1 -4
View File
@@ -47,7 +47,7 @@ from lerobot.configs import (
)
from lerobot.utils.import_utils import get_safe_default_video_backend
from .depth_utils import MM_PER_METRE, quantize_depth
from .depth_utils import quantize_depth
from .pyav_utils import get_pix_fmt_channels
logger = logging.getLogger(__name__)
@@ -848,9 +848,6 @@ class _CameraEncoderThread(threading.Thread):
# Reshape CHW to (H*W, C) for per-channel stats
channels = img_downsampled.shape[0]
img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels)
# Depth stats are canonically stored in millimetres; metre (float) depth is scaled up.
if self.is_depth and np.issubdtype(frame_data.dtype, np.floating):
img_for_stats = img_for_stats * MM_PER_METRE
stats_tracker.update(img_for_stats)
frame_count += 1
-9
View File
@@ -14,23 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import traceback
import draccus.wrappers.docstring as _draccus_docstring
import pytest
from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature
from lerobot.utils.import_utils import is_package_available
from tests.utils import DEVICE
# On every `draccus.parse()`, draccus rebuilds each dataclass field's help text by
# re-reading and re-parsing the class source (draccus.wrappers.docstring). For a config
# as large as TrainPipelineConfig this costs ~2.5s per parse — negligible for the single
# parse a CLI does, but tests parse configs hundreds of times. The source can't change
# within a run, so memoize it for the whole test session.
_draccus_docstring.get_attribute_docstring = functools.cache(_draccus_docstring.get_attribute_docstring)
# Import fixture modules as plugins.
# Fixtures that depend on optional packages are only registered when those packages are available,
# so that tests can be collected and run even with a minimal install.
-41
View File
@@ -245,44 +245,3 @@ class TestFeatureFileRouting:
dataset.save_episode()
dataset.finalize()
# ── 5. Depth stats unit canonicalization (millimetres) ────────────────
class TestDepthStatsUnit:
"""Depth stats are always stored in millimetres, regardless of raw frame dtype."""
NUM_FRAMES = 4
@pytest.mark.parametrize("use_videos", [False, True])
def test_stats_canonicalized_to_mm(self, tmp_path, features_factory, use_videos):
"""Float (metre) and integer (millimetre) depth over the same physical range
yield identical millimetre-scale stats."""
from lerobot.datasets.lerobot_dataset import LeRobotDataset
def _record(depth_dtype, root):
features = features_factory(
camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=use_videos
)
dataset = LeRobotDataset.create(
repo_id=DUMMY_REPO_ID,
fps=DEFAULT_FPS,
features=features,
root=root,
use_videos=use_videos,
streaming_encoding=use_videos,
)
add_frames(dataset, num_frames=self.NUM_FRAMES, depth_dtype=depth_dtype)
dataset.save_episode()
dataset.finalize()
return np.asarray(dataset.meta.stats[DEPTH_KEY]["mean"]).reshape(-1)
# add_frames ramps float depth over 0.1–10 m and integer depth over 100–10000 mm
# (the same physical range), so canonicalized stats must match.
mean_m = _record(np.float32, tmp_path / "ds_m")
mean_mm = _record(np.uint16, tmp_path / "ds_mm")
# Float (metre) input is scaled to millimetres, not left in the single-digit metre range.
assert mean_m.item() > 50.0
np.testing.assert_allclose(mean_m, mean_mm, rtol=0.05)
+7 -12
View File
@@ -49,18 +49,16 @@ from tests.fixtures.constants import (
)
def add_frames(dataset: LeRobotDataset, num_frames: int, depth_dtype: np.dtype = np.uint16) -> None:
def add_frames(dataset: LeRobotDataset, num_frames: int) -> None:
"""Append ``num_frames`` synthetic frames to ``dataset``.
Generates per-feature payloads from ``dataset.meta``: depth ramps (``depth_dtype``,
default ``uint16`` millimetres; pass ``np.float32`` for metres) for keys in
``dataset.meta.depth_keys``, uint8 random noise for video/image keys, and float32
zeros for everything else. ``DEFAULT_FEATURES`` (timestamp, frame_index, ...) are
auto-populated by ``add_frame`` and skipped here.
Generates per-feature payloads from ``dataset.meta``: uint16 depth ramps for
keys in ``dataset.meta.depth_keys``, uint8 random noise for video/image keys,
and float32 zeros for everything else. ``DEFAULT_FEATURES`` (timestamp,
frame_index, ...) are auto-populated by ``add_frame`` and skipped here.
"""
video_keys = dataset.meta.video_keys
depth_keys = dataset.meta.depth_keys
depth_is_float = np.issubdtype(depth_dtype, np.floating)
# Smooth gradient base reused per (H, W) to keep depth frames cheap to
# encode (HEVC Main 12 hates white noise).
_depth_base_cache: dict[tuple[int, int], np.ndarray] = {}
@@ -72,14 +70,11 @@ def add_frames(dataset: LeRobotDataset, num_frames: int, depth_dtype: np.dtype =
shape = ft["shape"]
if key in depth_keys:
h, w, _ = shape
# Float depth is expressed in metres, integer depth in millimetres.
lo, hi = (0.1, 10.0) if depth_is_float else (100.0, 10_000.0)
base = _depth_base_cache.setdefault(
(h, w),
np.linspace(lo, hi, h * w, dtype=np.float32).reshape(h, w, 1),
np.linspace(100.0, 10_000.0, h * w, dtype=np.float32).reshape(h, w, 1),
)
step = (0.05 if depth_is_float else 50.0) * i
frame[key] = (base + step).clip(0, 65535).astype(depth_dtype)
frame[key] = (base + 50.0 * i).clip(0, 65535).astype(np.uint16)
elif key in video_keys:
frame[key] = np.random.randint(0, 256, shape, dtype=np.uint8)
else: