mirror of
https://github.com/huggingface/lerobot.git
synced 2026-07-05 00:57:06 +00:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6d8ef7dc60 | |||
| ca6d764107 | |||
| 07285677a3 | |||
| 7ae12124b0 | |||
| c746ca2df2 | |||
| b961d2a8c5 |
@@ -18,7 +18,6 @@ from __future__ import annotations
|
||||
# Utilities
|
||||
########################################################################################
|
||||
import time
|
||||
from contextlib import nullcontext
|
||||
from copy import copy
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@@ -26,6 +25,7 @@ import numpy as np
|
||||
import torch
|
||||
|
||||
from lerobot.policies import PreTrainedPolicy, prepare_observation_for_inference
|
||||
from lerobot.utils.device_utils import get_safe_autocast_context
|
||||
from lerobot.utils.import_utils import _deepdiff_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _deepdiff_available:
|
||||
@@ -76,7 +76,7 @@ def predict_action(
|
||||
observation = copy(observation)
|
||||
with (
|
||||
torch.inference_mode(),
|
||||
torch.autocast(device_type=device.type) if device.type == "cuda" and use_amp else nullcontext(),
|
||||
get_safe_autocast_context(device, enabled=use_amp),
|
||||
):
|
||||
# Convert to pytorch format: channel first and float32 in [0,1] with batch dimension
|
||||
observation = prepare_observation_for_inference(observation, device, task, robot_type)
|
||||
|
||||
@@ -34,6 +34,8 @@ from .types import (
|
||||
)
|
||||
from .video import (
|
||||
DEFAULT_DEPTH_UNIT,
|
||||
DEPTH_METER_UNIT,
|
||||
DEPTH_MILLIMETER_UNIT,
|
||||
VALID_VIDEO_CODECS,
|
||||
VIDEO_ENCODER_INFO_KEYS,
|
||||
DepthEncoderConfig,
|
||||
@@ -41,6 +43,7 @@ from .video import (
|
||||
VideoEncoderConfig,
|
||||
depth_encoder_defaults,
|
||||
encoder_config_from_video_info,
|
||||
infer_depth_unit,
|
||||
rgb_encoder_defaults,
|
||||
)
|
||||
|
||||
@@ -70,8 +73,11 @@ __all__ = [
|
||||
"depth_encoder_defaults",
|
||||
# Factories
|
||||
"encoder_config_from_video_info",
|
||||
"infer_depth_unit",
|
||||
# Constants
|
||||
"DEFAULT_DEPTH_UNIT",
|
||||
"DEPTH_METER_UNIT",
|
||||
"DEPTH_MILLIMETER_UNIT",
|
||||
"VALID_VIDEO_CODECS",
|
||||
"VIDEO_ENCODER_INFO_KEYS",
|
||||
]
|
||||
|
||||
@@ -22,6 +22,8 @@ import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, ClassVar, Self
|
||||
|
||||
import numpy as np
|
||||
|
||||
from lerobot.utils.import_utils import require_package
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -36,7 +38,9 @@ HW_VIDEO_CODECS = [
|
||||
"h264_vaapi", # Linux Intel/AMD
|
||||
"h264_qsv", # Intel Quick Sync
|
||||
]
|
||||
VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
|
||||
VALID_VIDEO_CODECS: frozenset[str] = frozenset(
|
||||
{"h264", "hevc", "libsvtav1", "libaom-av1", "auto", *HW_VIDEO_CODECS}
|
||||
)
|
||||
# Aliases for legacy video codec names.
|
||||
VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"}
|
||||
|
||||
@@ -65,6 +69,15 @@ DEPTH_METER_UNIT: str = "m"
|
||||
DEPTH_MILLIMETER_UNIT: str = "mm"
|
||||
DEFAULT_DEPTH_UNIT: str = DEPTH_MILLIMETER_UNIT
|
||||
|
||||
|
||||
def infer_depth_unit(dtype: np.dtype | type) -> str:
|
||||
"""Infer the physical unit of raw depth frames from their dtype.
|
||||
|
||||
Floating-point frames are assumed to be in metres, integer frames in millimetres.
|
||||
"""
|
||||
return DEPTH_METER_UNIT if np.issubdtype(np.dtype(dtype), np.floating) else DEPTH_MILLIMETER_UNIT
|
||||
|
||||
|
||||
# Depth-specific tuning fields persisted under ``features[*]["info"]`` as ``video.<name>``.
|
||||
DEPTH_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset({"depth_min", "depth_max", "shift", "use_log"})
|
||||
|
||||
@@ -213,18 +226,24 @@ class VideoEncoderConfig:
|
||||
if encoder_threads is not None:
|
||||
svtav1_parts.append(f"lp={encoder_threads}")
|
||||
if svtav1_parts:
|
||||
opts["svtav1-params"] = ":".join(svtav1_parts)
|
||||
set_if("svtav1-params", ":".join(svtav1_parts))
|
||||
elif self.vcodec in ("h264", "hevc"):
|
||||
set_if("crf", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
if self.fast_decode:
|
||||
opts["tune"] = "fastdecode"
|
||||
set_if("tune", "fastdecode")
|
||||
set_if("threads", encoder_threads)
|
||||
elif self.vcodec == "libaom-av1":
|
||||
set_if("crf", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
if encoder_threads is not None:
|
||||
set_if("threads", encoder_threads)
|
||||
set_if("row-mt", 1)
|
||||
elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
|
||||
if self.crf is not None:
|
||||
opts["q:v"] = max(1, min(100, 100 - self.crf * 2))
|
||||
set_if("q:v", max(1, min(100, 100 - self.crf * 2)))
|
||||
elif self.vcodec in ("h264_nvenc", "hevc_nvenc"):
|
||||
opts["rc"] = 0
|
||||
set_if("rc", 0)
|
||||
set_if("qp", self.crf)
|
||||
set_if("preset", self.preset)
|
||||
elif self.vcodec == "h264_vaapi":
|
||||
|
||||
@@ -509,7 +509,7 @@ def compute_episode_stats(
|
||||
For 'image'/'video' features, stats are computed per channel and kept with a
|
||||
leading channel axis (e.g. shape (3, 1, 1) for RGB). RGB stats are divided by
|
||||
255 to land in [0, 1]; depth maps (features flagged with ``is_depth_map``) skip
|
||||
this rescaling and remain in their stored units.
|
||||
this rescaling and remain in their stored units (stored in ``depth_unit``).
|
||||
"""
|
||||
if quantile_list is None:
|
||||
quantile_list = DEFAULT_QUANTILES
|
||||
|
||||
@@ -26,12 +26,13 @@ import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from lerobot.configs import VideoEncoderConfig
|
||||
from lerobot.configs import DEPTH_METER_UNIT, VideoEncoderConfig
|
||||
from lerobot.utils.constants import DEFAULT_FEATURES, HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE
|
||||
from lerobot.utils.feature_utils import _validate_feature_names
|
||||
from lerobot.utils.utils import flatten_dict
|
||||
|
||||
from .compute_stats import aggregate_stats
|
||||
from .depth_utils import MM_PER_METRE
|
||||
from .feature_utils import create_empty_dataset_info
|
||||
from .io_utils import (
|
||||
get_file_size_in_mb,
|
||||
@@ -358,6 +359,35 @@ class LeRobotDatasetMetadata:
|
||||
|
||||
return [key for key, ft in self.features.items() if _is_depth(ft)]
|
||||
|
||||
def rescale_depth_stats(self, output_unit: str) -> None:
|
||||
"""Rescale depth feature stats in place from their recorded unit to ``output_unit``.
|
||||
|
||||
Depth stats are stored in the unit the frames were recorded in
|
||||
(``features[key]["info"]["depth_unit"]``), while frames are returned in
|
||||
``output_unit`` on read. This converts the unit-bearing stat entries so
|
||||
stats match the frames consumers see.
|
||||
"""
|
||||
missing_unit_keys = [
|
||||
key for key in self.depth_keys if (self.features[key].get("info") or {}).get("depth_unit") is None
|
||||
]
|
||||
if missing_unit_keys:
|
||||
logging.warning(
|
||||
f"Depth feature(s) {missing_unit_keys} have no recorded 'depth_unit' in their info. "
|
||||
f"Depth maps and stats for these keys will be returned AS IS, with no unit conversion "
|
||||
f"to the requested output unit {output_unit!r}. Re-record the dataset or set 'depth_unit' "
|
||||
f"in the feature info (meta/info.json) to enable conversion."
|
||||
)
|
||||
if self.stats is None:
|
||||
return
|
||||
for key in self.depth_keys:
|
||||
stored_unit = (self.features[key].get("info") or {}).get("depth_unit")
|
||||
if stored_unit is None or stored_unit == output_unit or key not in self.stats:
|
||||
continue
|
||||
factor = MM_PER_METRE if stored_unit == DEPTH_METER_UNIT else 1.0 / MM_PER_METRE
|
||||
self.stats[key] = {
|
||||
stat: value if stat == "count" else value * factor for stat, value in self.stats[key].items()
|
||||
}
|
||||
|
||||
@property
|
||||
def camera_keys(self) -> list[str]:
|
||||
"""Keys to access visual modalities (regardless of their storage method)."""
|
||||
|
||||
@@ -22,10 +22,14 @@ from pathlib import Path
|
||||
import datasets
|
||||
import torch
|
||||
|
||||
from lerobot.configs import DEFAULT_DEPTH_UNIT, DepthEncoderConfig
|
||||
from lerobot.configs import (
|
||||
DEFAULT_DEPTH_UNIT,
|
||||
DEPTH_METER_UNIT,
|
||||
DepthEncoderConfig,
|
||||
)
|
||||
|
||||
from .dataset_metadata import LeRobotDatasetMetadata
|
||||
from .depth_utils import dequantize_depth
|
||||
from .depth_utils import MM_PER_METRE, dequantize_depth
|
||||
from .feature_utils import (
|
||||
check_delta_timestamps,
|
||||
get_delta_indices,
|
||||
@@ -102,6 +106,13 @@ class DatasetReader:
|
||||
for vid_key in self._meta.depth_keys
|
||||
}
|
||||
|
||||
# Get the input unit of each depth feature stored as raw images.
|
||||
self._image_depth_units: dict[str, str | None] = {
|
||||
key: (self._meta.features[key].get("info") or {}).get("depth_unit")
|
||||
for key in self._meta.depth_keys
|
||||
if key in self._meta.image_keys
|
||||
}
|
||||
|
||||
def set_image_transforms(self, image_transforms: Callable | None) -> None:
|
||||
"""Replace the transform applied to visual observations."""
|
||||
if image_transforms is not None and not callable(image_transforms):
|
||||
@@ -329,6 +340,13 @@ class DatasetReader:
|
||||
continue
|
||||
item[cam] = self._image_transforms(item[cam])
|
||||
|
||||
# Convert depth features to the output unit.
|
||||
for key, stored_unit in self._image_depth_units.items():
|
||||
if key in item and stored_unit is not None and stored_unit != self._depth_output_unit:
|
||||
item[key] = (
|
||||
item[key] * MM_PER_METRE if stored_unit == DEPTH_METER_UNIT else item[key] / MM_PER_METRE
|
||||
)
|
||||
|
||||
# Add task as a string
|
||||
task_idx = item["task_index"].item()
|
||||
item["task"] = self._meta.tasks.iloc[task_idx].name
|
||||
|
||||
@@ -36,6 +36,7 @@ from lerobot.configs import (
|
||||
RGBEncoderConfig,
|
||||
VideoEncoderConfig,
|
||||
depth_encoder_defaults,
|
||||
infer_depth_unit,
|
||||
rgb_encoder_defaults,
|
||||
)
|
||||
|
||||
@@ -209,6 +210,15 @@ class DatasetWriter:
|
||||
self.episode_buffer["timestamp"].append(timestamp)
|
||||
self.episode_buffer["task"].append(frame.pop("task"))
|
||||
|
||||
# Record each depth feature's input unit once, inferred from the first frame's dtype.
|
||||
if frame_index == 0:
|
||||
for depth_key in self._meta.depth_keys:
|
||||
if depth_key not in frame:
|
||||
continue
|
||||
info = self._meta.features[depth_key].setdefault("info", {})
|
||||
if info.get("depth_unit") is None:
|
||||
info["depth_unit"] = infer_depth_unit(np.asarray(frame[depth_key]).dtype)
|
||||
|
||||
# Start streaming encoder on first frame of episode
|
||||
if frame_index == 0 and self._streaming_encoder is not None:
|
||||
self._streaming_encoder.start_episode(
|
||||
|
||||
@@ -34,12 +34,13 @@ from lerobot.configs.video import (
|
||||
DEPTH_METER_UNIT,
|
||||
DEPTH_MILLIMETER_UNIT,
|
||||
DEPTH_QMAX,
|
||||
infer_depth_unit,
|
||||
)
|
||||
|
||||
from .image_writer import squeeze_single_channel
|
||||
from .pyav_utils import write_u16_plane
|
||||
|
||||
_MM_PER_METRE = 1000.0
|
||||
MM_PER_METRE = 1000.0
|
||||
_UINT16_MAX = 65535
|
||||
|
||||
|
||||
@@ -57,11 +58,7 @@ def _depth_input_to_float32_and_unit(
|
||||
input_unit: Literal["auto", DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT],
|
||||
) -> tuple[NDArray[np.float32], Literal[DEPTH_METER_UNIT, DEPTH_MILLIMETER_UNIT]]:
|
||||
"""Convert depth to float32 in the chosen unit, and return the resolved unit."""
|
||||
resolved_unit = (
|
||||
(DEPTH_METER_UNIT if np.issubdtype(depth.dtype, np.floating) else DEPTH_MILLIMETER_UNIT)
|
||||
if input_unit == "auto"
|
||||
else input_unit
|
||||
)
|
||||
resolved_unit = infer_depth_unit(depth.dtype) if input_unit == "auto" else input_unit
|
||||
return depth.astype(np.float32, order="K"), resolved_unit
|
||||
|
||||
|
||||
@@ -126,12 +123,12 @@ def quantize_depth(
|
||||
|
||||
# Convert depth_min, depth_max, and shift to the resolved input unit.
|
||||
depth_min_u = (
|
||||
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * _MM_PER_METRE)
|
||||
np.float32(depth_min) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_min * MM_PER_METRE)
|
||||
)
|
||||
depth_max_u = (
|
||||
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * _MM_PER_METRE)
|
||||
np.float32(depth_max) if resolved_unit == DEPTH_METER_UNIT else np.float32(depth_max * MM_PER_METRE)
|
||||
)
|
||||
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * _MM_PER_METRE)
|
||||
shift_u = np.float32(shift) if resolved_unit == DEPTH_METER_UNIT else np.float32(shift * MM_PER_METRE)
|
||||
|
||||
# Normalization and quantization is performed in the resolved input unit.
|
||||
if use_log:
|
||||
@@ -236,7 +233,7 @@ def dequantize_depth(
|
||||
|
||||
# mm path: round + clamp in float32, skipping the uint16 round-trip
|
||||
# when returning a tensor (torch.uint16 is poorly supported).
|
||||
buf.mul_(_MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
|
||||
buf.mul_(MM_PER_METRE).round_().clamp_(0.0, _UINT16_MAX)
|
||||
if output_tensor:
|
||||
return buf
|
||||
return buf.cpu().numpy().astype(np.uint16, copy=False)
|
||||
@@ -259,7 +256,7 @@ def dequantize_depth(
|
||||
if output_unit == DEPTH_METER_UNIT:
|
||||
return torch.from_numpy(buf) if output_tensor else buf
|
||||
|
||||
np.multiply(buf, _MM_PER_METRE, out=buf)
|
||||
np.multiply(buf, MM_PER_METRE, out=buf)
|
||||
np.rint(buf, out=buf)
|
||||
np.clip(buf, 0.0, _UINT16_MAX, out=buf)
|
||||
if output_tensor:
|
||||
|
||||
@@ -224,6 +224,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
)
|
||||
self.root = self.meta.root
|
||||
self.revision = self.meta.revision
|
||||
self.meta.rescale_depth_stats(self._depth_output_unit)
|
||||
|
||||
if episodes is not None and any(
|
||||
episode >= self.meta.total_episodes or episode < 0 for episode in episodes
|
||||
@@ -350,6 +351,11 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
"""Frames per second used during data collection."""
|
||||
return self.meta.fps
|
||||
|
||||
@property
|
||||
def depth_output_unit(self) -> str:
|
||||
"""Physical unit (``"m"`` or ``"mm"``) depth maps and statistics are returned in on read."""
|
||||
return self._depth_output_unit
|
||||
|
||||
@property
|
||||
def num_frames(self) -> int:
|
||||
"""Number of frames in selected episodes."""
|
||||
|
||||
@@ -22,11 +22,11 @@ import numpy as np
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
|
||||
from lerobot.configs import DEFAULT_DEPTH_UNIT, DepthEncoderConfig
|
||||
from lerobot.configs import DEFAULT_DEPTH_UNIT, DEPTH_METER_UNIT, DepthEncoderConfig
|
||||
from lerobot.utils.constants import HF_LEROBOT_HOME, LOOKAHEAD_BACKTRACKTABLE, LOOKBACK_BACKTRACKTABLE
|
||||
|
||||
from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
|
||||
from .depth_utils import dequantize_depth
|
||||
from .depth_utils import MM_PER_METRE, dequantize_depth
|
||||
from .feature_utils import get_delta_indices
|
||||
from .io_utils import item_to_torch
|
||||
from .utils import (
|
||||
@@ -310,6 +310,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
)
|
||||
self.root = self.meta.root
|
||||
self.revision = self.meta.revision
|
||||
self.meta.rescale_depth_stats(self._depth_output_unit)
|
||||
# Check version
|
||||
check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION)
|
||||
|
||||
@@ -318,6 +319,13 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
for vid_key in self.meta.depth_keys
|
||||
}
|
||||
|
||||
# Input unit of each depth feature stored as raw images (dequantized separately from videos).
|
||||
self._image_depth_units: dict[str, str | None] = {
|
||||
key: (self.meta.features[key].get("info") or {}).get("depth_unit")
|
||||
for key in self.meta.depth_keys
|
||||
if key in self.meta.image_keys
|
||||
}
|
||||
|
||||
self.delta_timestamps = None
|
||||
self.delta_indices = None
|
||||
|
||||
@@ -348,6 +356,11 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
def fps(self):
|
||||
return self.meta.fps
|
||||
|
||||
@property
|
||||
def depth_output_unit(self) -> str:
|
||||
"""Physical unit (``"m"`` or ``"mm"``) depth maps are returned in on read."""
|
||||
return self._depth_output_unit
|
||||
|
||||
@staticmethod
|
||||
def _iter_random_indices(
|
||||
rng: np.random.Generator, buffer_size: int, random_batch_size=100
|
||||
@@ -530,6 +543,15 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
for update in updates:
|
||||
result.update(update)
|
||||
|
||||
# Convert raw-image depth features to the output unit (video depth is already converted).
|
||||
for key, stored_unit in self._image_depth_units.items():
|
||||
if key in result and stored_unit is not None and stored_unit != self._depth_output_unit:
|
||||
result[key] = (
|
||||
result[key] * MM_PER_METRE
|
||||
if stored_unit == DEPTH_METER_UNIT
|
||||
else result[key] / MM_PER_METRE
|
||||
)
|
||||
|
||||
result["task"] = self.meta.tasks.iloc[item["task_index"]].name
|
||||
|
||||
yield result
|
||||
|
||||
@@ -43,6 +43,7 @@ from torch import Tensor
|
||||
|
||||
from lerobot.configs import FeatureType, PolicyFeature
|
||||
from lerobot.utils.constants import ACTION, OBS_IMAGES
|
||||
from lerobot.utils.device_utils import get_safe_autocast_context
|
||||
from lerobot.utils.import_utils import require_package
|
||||
|
||||
from ..pretrained import PreTrainedPolicy
|
||||
@@ -243,7 +244,7 @@ class GrootPolicy(PreTrainedPolicy):
|
||||
|
||||
# Run GR00T forward under bf16 autocast when enabled to reduce activation memory
|
||||
# Rationale: Matches original GR00T finetuning (bf16 compute, fp32 params) and avoids fp32 upcasts.
|
||||
with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=self.config.use_bf16):
|
||||
with get_safe_autocast_context(device, dtype=torch.bfloat16, enabled=self.config.use_bf16):
|
||||
outputs = self._groot_model.forward(groot_inputs)
|
||||
|
||||
# Isaac-GR00T returns a BatchFeature; loss key is typically 'loss'
|
||||
@@ -275,7 +276,7 @@ class GrootPolicy(PreTrainedPolicy):
|
||||
device = next(self.parameters()).device
|
||||
|
||||
# Use bf16 autocast for inference to keep memory low and match backbone dtype
|
||||
with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=self.config.use_bf16):
|
||||
with get_safe_autocast_context(device, dtype=torch.bfloat16, enabled=self.config.use_bf16):
|
||||
outputs = self._groot_model.get_action(groot_inputs)
|
||||
|
||||
actions = outputs.get("action_pred")
|
||||
|
||||
@@ -31,7 +31,6 @@ import logging
|
||||
import os
|
||||
import types
|
||||
from collections import deque
|
||||
from contextlib import nullcontext
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import numpy as np
|
||||
@@ -43,6 +42,7 @@ from torch.distributions import Beta
|
||||
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.utils.constants import ACTION
|
||||
from lerobot.utils.device_utils import get_safe_autocast_context
|
||||
from lerobot.utils.import_utils import _scipy_available, _transformers_available, require_package
|
||||
|
||||
from ..rtc.modeling_rtc import RTCProcessor
|
||||
@@ -1644,10 +1644,8 @@ class MolmoAct2Policy(PreTrainedPolicy):
|
||||
device=device,
|
||||
)
|
||||
action_dim = self._output_action_dim(batch)
|
||||
autocast_context = (
|
||||
torch.autocast(device_type=device.type, dtype=model_dtype)
|
||||
if device.type in {"cuda", "cpu"} and model_dtype in {torch.bfloat16, torch.float16}
|
||||
else nullcontext()
|
||||
autocast_context = get_safe_autocast_context(
|
||||
device, dtype=model_dtype, enabled=model_dtype in {torch.bfloat16, torch.float16}
|
||||
)
|
||||
with autocast_context:
|
||||
if inference_action_mode == "discrete":
|
||||
|
||||
@@ -26,6 +26,7 @@ from torch import Tensor, nn
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy, T
|
||||
from lerobot.policies.utils import populate_queues
|
||||
from lerobot.utils.constants import ACTION, OBS_STATE
|
||||
from lerobot.utils.device_utils import get_safe_autocast_context
|
||||
from lerobot.utils.import_utils import _transformers_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
@@ -183,7 +184,7 @@ class VLAJEPAModel(nn.Module):
|
||||
action_idx = action_mask.nonzero(as_tuple=True)
|
||||
|
||||
device_type = next(self.parameters()).device.type
|
||||
with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
|
||||
with get_safe_autocast_context(device_type, dtype=torch.bfloat16):
|
||||
last_hidden = self._qwen_last_decoder_hidden(qwen_inputs) # [B, seq_len, H]
|
||||
b, _, h = last_hidden.shape
|
||||
embodied_action_tokens = last_hidden[embodied_idx[0], embodied_idx[1], :].view(b, -1, h)
|
||||
@@ -250,7 +251,7 @@ class VLAJEPAModel(nn.Module):
|
||||
) -> Tensor:
|
||||
"""Flow-matching action-head loss, repeated over `repeated_diffusion_steps`."""
|
||||
device_type = next(self.parameters()).device.type
|
||||
with torch.autocast(device_type=device_type, dtype=torch.float32):
|
||||
with get_safe_autocast_context(device_type, dtype=torch.float32):
|
||||
r = self.config.repeated_diffusion_steps
|
||||
horizon = self.config.chunk_size
|
||||
actions_target = actions[:, -horizon:, :].to(torch.float32).repeat(r, 1, 1)
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from contextlib import nullcontext
|
||||
from copy import copy
|
||||
|
||||
import torch
|
||||
@@ -25,6 +24,7 @@ import torch
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.policies.utils import make_robot_action, prepare_observation_for_inference
|
||||
from lerobot.processor import PolicyProcessorPipeline
|
||||
from lerobot.utils.device_utils import get_safe_autocast_context
|
||||
|
||||
from .base import InferenceEngine
|
||||
|
||||
@@ -102,11 +102,7 @@ class SyncInferenceEngine(InferenceEngine):
|
||||
# ``obs_frame`` fresh per tick via ``build_dataset_frame``, so the
|
||||
# tensor/array values are not shared with any other reader.
|
||||
observation = copy(obs_frame)
|
||||
autocast_ctx = (
|
||||
torch.autocast(device_type=self._device.type)
|
||||
if self._device.type == "cuda" and self._policy.config.use_amp
|
||||
else nullcontext()
|
||||
)
|
||||
autocast_ctx = get_safe_autocast_context(self._device, enabled=self._policy.config.use_amp)
|
||||
with torch.inference_mode(), autocast_ctx:
|
||||
observation = prepare_observation_for_inference(
|
||||
observation, self._device, self._task, self._robot_type
|
||||
|
||||
@@ -84,6 +84,7 @@ import torch
|
||||
import torch.utils.data
|
||||
import tqdm
|
||||
|
||||
from lerobot.configs import DEPTH_MILLIMETER_UNIT
|
||||
from lerobot.datasets import LeRobotDataset
|
||||
from lerobot.utils.constants import ACTION, DONE, OBS_STATE, REWARD, SUCCESS
|
||||
from lerobot.utils.utils import init_logging
|
||||
@@ -228,6 +229,9 @@ def visualize_dataset(
|
||||
|
||||
logging.info("Logging to Rerun")
|
||||
|
||||
# Depth frames and stats are dequantized to the dataset's depth_output_unit on load.
|
||||
depth_meter = 1000.0 if dataset.depth_output_unit == DEPTH_MILLIMETER_UNIT else 1.0
|
||||
|
||||
# Use the dataset's q01/q99 depth statistics for robust depth range bounds
|
||||
depth_ranges = {}
|
||||
for key in dataset.meta.depth_keys:
|
||||
@@ -254,6 +258,7 @@ def visualize_dataset(
|
||||
depth = to_hwc_float32_numpy(batch[key][i])
|
||||
depth_entity = rr.DepthImage(
|
||||
depth,
|
||||
meter=depth_meter,
|
||||
colormap=rr.components.Colormap.Viridis,
|
||||
depth_range=depth_ranges.get(key),
|
||||
)
|
||||
|
||||
@@ -56,7 +56,6 @@ import threading
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable
|
||||
from contextlib import nullcontext
|
||||
from copy import deepcopy
|
||||
from dataclasses import asdict
|
||||
from functools import partial
|
||||
@@ -86,7 +85,7 @@ from lerobot.policies import PreTrainedPolicy, make_policy, make_pre_post_proces
|
||||
from lerobot.processor import PolicyProcessorPipeline
|
||||
from lerobot.types import PolicyAction
|
||||
from lerobot.utils.constants import ACTION, DONE, OBS_IMAGE, OBS_IMAGES, OBS_STR, REWARD
|
||||
from lerobot.utils.device_utils import get_safe_torch_device
|
||||
from lerobot.utils.device_utils import get_safe_autocast_context, get_safe_torch_device
|
||||
from lerobot.utils.import_utils import register_third_party_plugins
|
||||
from lerobot.utils.io_utils import write_video
|
||||
from lerobot.utils.random_utils import set_seed
|
||||
@@ -698,7 +697,7 @@ def eval_main(cfg: EvalPipelineConfig):
|
||||
max_episodes_rendered = 0 if cfg.eval.recording else 10
|
||||
videos_dir = None if cfg.eval.recording else Path(cfg.output_dir) / "videos"
|
||||
|
||||
with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext():
|
||||
with torch.no_grad(), get_safe_autocast_context(device, enabled=cfg.policy.use_amp):
|
||||
info = eval_policy_all(
|
||||
envs=envs,
|
||||
policy=policy,
|
||||
|
||||
@@ -211,8 +211,12 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
|
||||
# Accelerate auto-detects the device based on the available hardware and ignores the policy.device setting.
|
||||
# Force the device to be CPU when the active config's device is set to CPU (works for both policy and reward model training).
|
||||
force_cpu = cfg.trainable_config.device == "cpu"
|
||||
# Drive Accelerate's autocast from policy.dtype (bf16/fp16 activate it; float32/absent -> launcher default).
|
||||
policy_dtype = getattr(cfg.trainable_config, "dtype", None)
|
||||
mixed_precision = {"bfloat16": "bf16", "float16": "fp16", "float32": "no"}.get(policy_dtype)
|
||||
accelerator = Accelerator(
|
||||
step_scheduler_with_optimizer=False,
|
||||
mixed_precision=mixed_precision,
|
||||
kwargs_handlers=[ddp_kwargs],
|
||||
cpu=force_cpu,
|
||||
)
|
||||
|
||||
@@ -33,7 +33,12 @@ from .constants import (
|
||||
REWARD,
|
||||
)
|
||||
from .decorators import check_if_already_connected, check_if_not_connected
|
||||
from .device_utils import auto_select_torch_device, get_safe_torch_device, is_torch_device_available
|
||||
from .device_utils import (
|
||||
auto_select_torch_device,
|
||||
get_safe_autocast_context,
|
||||
get_safe_torch_device,
|
||||
is_torch_device_available,
|
||||
)
|
||||
from .errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
|
||||
from .import_utils import is_package_available, require_package
|
||||
|
||||
@@ -51,6 +56,7 @@ __all__ = [
|
||||
"REWARD",
|
||||
# Device utilities
|
||||
"auto_select_torch_device",
|
||||
"get_safe_autocast_context",
|
||||
"get_safe_torch_device",
|
||||
"is_torch_device_available",
|
||||
# Import guards
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from contextlib import AbstractContextManager, nullcontext
|
||||
|
||||
import torch
|
||||
|
||||
@@ -107,3 +108,25 @@ def is_amp_available(device: str):
|
||||
return False
|
||||
else:
|
||||
raise ValueError(f"Unknown device '{device}.")
|
||||
|
||||
|
||||
def get_safe_autocast_context(
|
||||
device: str | torch.device,
|
||||
*,
|
||||
dtype: torch.dtype | None = None,
|
||||
enabled: bool = True,
|
||||
) -> AbstractContextManager:
|
||||
"""Return a ``torch.autocast`` context, or a no-op when AMP is unsupported on ``device``.
|
||||
|
||||
Autocast is only entered on devices that support AMP (cuda, xpu, cpu); on mps and any
|
||||
unknown device this falls back to ``nullcontext()`` so callers can request autocast
|
||||
unconditionally without breaking on unsupported backends.
|
||||
"""
|
||||
device_type = device.type if isinstance(device, torch.device) else str(device).split(":", 1)[0]
|
||||
try:
|
||||
amp_ok = is_amp_available(device_type)
|
||||
except ValueError:
|
||||
amp_ok = False
|
||||
if not enabled or not amp_ok:
|
||||
return nullcontext()
|
||||
return torch.autocast(device_type=device_type, dtype=dtype)
|
||||
|
||||
@@ -24,6 +24,7 @@ import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from lerobot.configs import DEPTH_MILLIMETER_UNIT, infer_depth_unit
|
||||
from lerobot.types import RobotAction, RobotObservation
|
||||
|
||||
from .constants import ACTION, ACTION_PREFIX, OBS_PREFIX, OBS_STR
|
||||
@@ -161,7 +162,13 @@ def log_rerun_data(
|
||||
observation_paths.add(key)
|
||||
else:
|
||||
if arr.shape[-1] == 1:
|
||||
img_entity = rr.DepthImage(arr, colormap=rr.components.Colormap.Viridis)
|
||||
# At record time, the depth unit is inferred from the frame type.
|
||||
depth_unit = infer_depth_unit(arr.dtype)
|
||||
img_entity = rr.DepthImage(
|
||||
arr,
|
||||
meter=1000.0 if depth_unit == DEPTH_MILLIMETER_UNIT else 1.0,
|
||||
colormap=rr.components.Colormap.Viridis,
|
||||
)
|
||||
else:
|
||||
img_entity = rr.Image(arr).compress() if compress_images else rr.Image(arr)
|
||||
rr.log(key, entity=img_entity, static=True)
|
||||
|
||||
@@ -1531,6 +1531,7 @@ def test_valid_video_codecs_constant():
|
||||
assert "h264" in VALID_VIDEO_CODECS
|
||||
assert "hevc" in VALID_VIDEO_CODECS
|
||||
assert "libsvtav1" in VALID_VIDEO_CODECS
|
||||
assert "libaom-av1" in VALID_VIDEO_CODECS
|
||||
assert "auto" in VALID_VIDEO_CODECS
|
||||
assert "h264_videotoolbox" in VALID_VIDEO_CODECS
|
||||
assert "h264_nvenc" in VALID_VIDEO_CODECS
|
||||
@@ -1538,7 +1539,7 @@ def test_valid_video_codecs_constant():
|
||||
assert "h264_qsv" in VALID_VIDEO_CODECS
|
||||
assert "hevc_videotoolbox" in VALID_VIDEO_CODECS
|
||||
assert "hevc_nvenc" in VALID_VIDEO_CODECS
|
||||
assert len(VALID_VIDEO_CODECS) == 10
|
||||
assert len(VALID_VIDEO_CODECS) == 11
|
||||
|
||||
|
||||
def test_delta_timestamps_with_episodes_filter(tmp_path, empty_lerobot_dataset_factory):
|
||||
|
||||
@@ -32,6 +32,7 @@ from lerobot.configs.video import (
|
||||
)
|
||||
from lerobot.datasets.depth_utils import dequantize_depth, quantize_depth
|
||||
from lerobot.datasets.image_writer import image_array_to_pil_image, write_image
|
||||
from lerobot.utils.constants import DEFAULT_FEATURES
|
||||
from tests.fixtures.constants import (
|
||||
DEFAULT_FPS,
|
||||
DUMMY_CAMERA_FEATURES,
|
||||
@@ -245,3 +246,91 @@ class TestFeatureFileRouting:
|
||||
|
||||
dataset.save_episode()
|
||||
dataset.finalize()
|
||||
|
||||
|
||||
class TestDepthUnitMetadata:
|
||||
"""The depth unit is inferred once from dtype, stored in ``info``, and drives stats + reads."""
|
||||
|
||||
NUM_FRAMES = 4
|
||||
|
||||
def _record(self, root, features_factory, depth_dtype, value, use_videos):
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
features = features_factory(camera_features=DUMMY_CAMERA_FEATURES_WITH_DEPTH, use_videos=use_videos)
|
||||
dataset = LeRobotDataset.create(
|
||||
repo_id=DUMMY_REPO_ID,
|
||||
fps=DEFAULT_FPS,
|
||||
features=features,
|
||||
root=root,
|
||||
use_videos=use_videos,
|
||||
streaming_encoding=use_videos,
|
||||
)
|
||||
for _ in range(self.NUM_FRAMES):
|
||||
frame: dict = {"task": "test"}
|
||||
for key, ft in dataset.meta.features.items():
|
||||
if key in DEFAULT_FEATURES:
|
||||
continue
|
||||
if key in dataset.meta.depth_keys:
|
||||
frame[key] = np.full(ft["shape"], value, dtype=depth_dtype)
|
||||
elif key in dataset.meta.camera_keys:
|
||||
frame[key] = np.random.randint(0, 256, ft["shape"], dtype=np.uint8)
|
||||
else:
|
||||
frame[key] = np.zeros(ft["shape"], dtype=np.float32)
|
||||
dataset.add_frame(frame)
|
||||
return dataset
|
||||
|
||||
@pytest.mark.parametrize("use_videos", [False, True])
|
||||
@pytest.mark.parametrize(
|
||||
("depth_dtype", "value", "expected_unit"),
|
||||
[(np.float32, 2.0, DEPTH_METER_UNIT), (np.uint16, 2000, DEPTH_MILLIMETER_UNIT)],
|
||||
)
|
||||
def test_recorded_unit_inferred_persisted_and_kept_in_stats(
|
||||
self, tmp_path, features_factory, use_videos, depth_dtype, value, expected_unit
|
||||
):
|
||||
"""Unit is inferred from the first frame's dtype, drives stats (raw, never canonicalized), and survives a reload."""
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
dataset = self._record(tmp_path / "ds", features_factory, depth_dtype, value, use_videos)
|
||||
assert dataset.meta.features[DEPTH_KEY]["info"]["depth_unit"] == expected_unit
|
||||
dataset.save_episode()
|
||||
mean = float(np.asarray(dataset.meta.stats[DEPTH_KEY]["mean"]).reshape(-1)[0])
|
||||
np.testing.assert_allclose(mean, value, rtol=0.05)
|
||||
dataset.finalize()
|
||||
|
||||
reloaded = LeRobotDataset(repo_id=DUMMY_REPO_ID, root=tmp_path / "ds")
|
||||
assert reloaded.meta.features[DEPTH_KEY]["info"]["depth_unit"] == expected_unit
|
||||
|
||||
@pytest.mark.parametrize("use_videos", [False, True])
|
||||
@pytest.mark.parametrize(
|
||||
("output_unit", "expected"),
|
||||
[(DEPTH_MILLIMETER_UNIT, 2000.0), (DEPTH_METER_UNIT, 2.0)],
|
||||
)
|
||||
def test_read_honors_output_unit_for_frames_and_stats(
|
||||
self, tmp_path, features_factory, use_videos, output_unit, expected
|
||||
):
|
||||
"""Reloading with a ``depth_output_unit`` converts metre frames (image mode) and rescales stats while preserving count."""
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
dataset = self._record(tmp_path / "ds", features_factory, np.float32, 2.0, use_videos=use_videos)
|
||||
dataset.save_episode()
|
||||
count = float(np.asarray(dataset.meta.stats[DEPTH_KEY]["count"]).reshape(-1)[0])
|
||||
dataset.finalize()
|
||||
|
||||
read_dataset = LeRobotDataset(
|
||||
repo_id=DUMMY_REPO_ID, root=tmp_path / "ds", depth_output_unit=output_unit
|
||||
)
|
||||
stats = read_dataset.meta.stats[DEPTH_KEY]
|
||||
np.testing.assert_allclose(float(np.asarray(stats["mean"]).reshape(-1)[0]), expected, rtol=0.05)
|
||||
np.testing.assert_allclose(float(np.asarray(stats["count"]).reshape(-1)[0]), count)
|
||||
|
||||
if not use_videos:
|
||||
depth = read_dataset[0][DEPTH_KEY]
|
||||
assert torch.allclose(depth, torch.full_like(depth, expected))
|
||||
|
||||
from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset
|
||||
|
||||
stream_dataset = StreamingLeRobotDataset(
|
||||
repo_id=DUMMY_REPO_ID, root=tmp_path / "ds", depth_output_unit=output_unit
|
||||
)
|
||||
stream_depth = next(iter(stream_dataset))[DEPTH_KEY]
|
||||
assert torch.allclose(stream_depth, torch.full_like(stream_depth, expected))
|
||||
|
||||
@@ -345,7 +345,9 @@ class TestExtraOptions:
|
||||
opts = cfg.get_codec_options()
|
||||
assert opts["qp"] == 20
|
||||
assert isinstance(opts["qp"], int)
|
||||
assert cfg.get_codec_options(as_strings=True)["qp"] == "20"
|
||||
str_opts = cfg.get_codec_options(as_strings=True)
|
||||
assert str_opts["qp"] == "20"
|
||||
assert all(isinstance(v, str) for v in str_opts.values())
|
||||
|
||||
@require_libsvtav1
|
||||
def test_structured_fields_win_on_collision(self):
|
||||
|
||||
Vendored
+8
@@ -26,6 +26,7 @@ import pytest
|
||||
import torch
|
||||
from datasets import Dataset
|
||||
|
||||
from lerobot.configs.video import infer_depth_unit
|
||||
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
|
||||
from lerobot.datasets.feature_utils import get_hf_features_from_features
|
||||
from lerobot.datasets.io_utils import flatten_dict, hf_transform_to_torch
|
||||
@@ -535,6 +536,13 @@ def lerobot_dataset_factory(
|
||||
chunks_size=chunks_size,
|
||||
**info_kwargs,
|
||||
)
|
||||
# This synthetic path skips add_frame, so record the depth unit the writer would
|
||||
# have stored (dummy depth is uint16) to keep ``depth_unit`` present in info.json.
|
||||
# Reassign a fresh info dict to avoid mutating the shared feature constants.
|
||||
for ft in info.features.values():
|
||||
ft_info = ft.get("info")
|
||||
if ft_info is not None and ft_info.get("is_depth_map") and "depth_unit" not in ft_info:
|
||||
ft["info"] = {**ft_info, "depth_unit": infer_depth_unit(np.uint16)}
|
||||
if stats is None:
|
||||
stats = stats_factory(features=info.features)
|
||||
if tasks is None:
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from contextlib import nullcontext
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from lerobot.utils.device_utils import get_safe_autocast_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("device", "enabled", "expect_autocast"),
|
||||
[
|
||||
("cpu", True, True), # AMP-capable device -> real autocast
|
||||
(torch.device("cpu"), True, True), # accepts torch.device
|
||||
("cpu", False, False), # explicitly disabled -> no-op
|
||||
("mps", True, False), # AMP unsupported on mps -> no-op
|
||||
("privateuseone", True, False), # unknown device -> safe no-op
|
||||
],
|
||||
)
|
||||
def test_get_safe_autocast_context(device, enabled, expect_autocast):
|
||||
ctx = get_safe_autocast_context(device, dtype=torch.bfloat16, enabled=enabled)
|
||||
if expect_autocast:
|
||||
assert isinstance(ctx, torch.autocast)
|
||||
with ctx:
|
||||
assert torch.is_autocast_enabled("cpu")
|
||||
else:
|
||||
assert isinstance(ctx, nullcontext)
|
||||
@@ -50,8 +50,9 @@ def mock_rerun(monkeypatch):
|
||||
return self
|
||||
|
||||
class DummyDepthImage:
|
||||
def __init__(self, arr, colormap=None):
|
||||
def __init__(self, arr, meter=None, colormap=None):
|
||||
self.arr = arr
|
||||
self.meter = meter
|
||||
self.colormap = colormap
|
||||
|
||||
def dummy_log(key, obj=None, **kwargs):
|
||||
|
||||
Reference in New Issue
Block a user