Compare commits

..

3 Commits

Author SHA1 Message Date
Steven Palma d0ae3e9481 refactor(dataset): update imports across the codebase 2026-03-15 23:09:52 -07:00
Steven Palma 26d732c8c8 refactor(dataset): modular files 2026-03-15 23:07:52 -07:00
Steven Palma c7458c67cd chore(dataset): basic house-keeping 2026-03-15 21:26:58 -07:00
9 changed files with 84 additions and 2141 deletions
+7 -11
View File
@@ -204,26 +204,22 @@ Replace `your_username/dataset_name` with your Hugging Face username and a name
Your dataset includes:
**Your Actions (2 features)**:
**Your Actions (2 things)**:
- `linear_velocity`: How much you moved forward/backward
- `angular_velocity`: How much you turned left/right
- How much you moved forward/backward
- How much you turned left/right
**Robot Observations (24 features)**:
**Robot Observations (12 things)**:
- Front camera video
- Rear camera video
- Current speed
- Battery level
- Orientation
- GPS (latitude, longitude, signal strength)
- Which way the robot is facing
- GPS location (latitude, longitude, signal strength)
- Network signal strength
- Vibration level
- Lamp state (on/off)
- Accelerometer (x, y, z)
- Gyroscope (x, y, z)
- Magnetometer (x, y, z)
- Wheel RPMs (4 wheels)
- Lamp status (on/off)
### Where Your Data Goes
@@ -1,717 +0,0 @@
"""
Action consistency analysis for imitation learning datasets.
Two parallel analyses per dataset:
1. State-based: KNN in joint-state space → action chunk variance
2. Image-based: KNN in SigLIP embedding space → action chunk variance
Comparing them reveals whether visual similarity and proprioceptive similarity
agree on where the data is inconsistent — and images are what the policy
primarily sees.
"""
import json
from pathlib import Path
import av
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from huggingface_hub import snapshot_download
from matplotlib.colors import LinearSegmentedColormap
from PIL import Image
from scipy.spatial import cKDTree
from transformers import AutoImageProcessor, AutoModel
DATASETS = [
{"repo_id": "lerobot-data-collection/level2_final_quality3", "label": "HQ curated"},
{"repo_id": "lerobot-data-collection/level12_rac_2_2026-02-08_1", "label": "Full collection"},
]
OUTPUT_DIR = Path(__file__).resolve().parent / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)
MAX_FRAMES = 100_000
K_NEIGHBORS = 50
ACTION_CHUNK_SIZE = 30
CAMERA_KEY = "observation.images.base"
ENCODER_MODEL = "google/siglip-base-patch16-224"
ENCODE_BATCH_SIZE = 512
SEED = 42
DPI = 150
CONSISTENCY_CMAP = LinearSegmentedColormap.from_list(
"consistency", ["#0a2e0a", "#1a8e1a", "#88cc22", "#ffaa22", "#ff2222"]
)
# FK chains from OpenArm bimanual URDF (same as workspace_density.py).
LEFT_CHAIN = [
((-np.pi / 2, 0, 0), (0, 0.031, 0.698), None),
((0, 0, 0), (0, 0, 0.0625), (0, 0, 1)),
((-np.pi / 2, 0, 0), (-0.0301, 0, 0.06), (-1, 0, 0)),
((0, 0, 0), (0.0301, 0, 0.06625), (0, 0, 1)),
((0, 0, 0), (0, 0.0315, 0.15375), (0, 1, 0)),
((0, 0, 0), (0, -0.0315, 0.0955), (0, 0, 1)),
((0, 0, 0), (0.0375, 0, 0.1205), (1, 0, 0)),
((0, 0, 0), (-0.0375, 0, 0), (0, -1, 0)),
((0, 0, 0), (0, 0, 0.1001), None),
((0, 0, 0), (0, 0, 0.08), None),
]
RIGHT_CHAIN = [
((np.pi / 2, 0, 0), (0, -0.031, 0.698), None),
((0, 0, 0), (0, 0, 0.0625), (0, 0, 1)),
((np.pi / 2, 0, 0), (-0.0301, 0, 0.06), (-1, 0, 0)),
((0, 0, 0), (0.0301, 0, 0.06625), (0, 0, 1)),
((0, 0, 0), (0, 0.0315, 0.15375), (0, 1, 0)),
((0, 0, 0), (0, -0.0315, 0.0955), (0, 0, 1)),
((0, 0, 0), (0.0375, 0, 0.1205), (1, 0, 0)),
((0, 0, 0), (-0.0375, 0, 0), (0, 1, 0)),
((0, 0, 0), (0, 0, 0.1001), None),
((0, 0, 0), (0, 0, 0.08), None),
]
# ── FK math ─────────────────────────────────────────────
def _rot_x(a: float) -> np.ndarray:
c, s = np.cos(a), np.sin(a)
return np.array([[1, 0, 0], [0, c, -s], [0, s, c]])
def _rot_y(a: float) -> np.ndarray:
c, s = np.cos(a), np.sin(a)
return np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]])
def _rot_z(a: float) -> np.ndarray:
c, s = np.cos(a), np.sin(a)
return np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
def _tf(rpy: tuple, xyz: tuple) -> np.ndarray:
r, p, y = rpy
mat = np.eye(4)
mat[:3, :3] = _rot_z(y) @ _rot_y(p) @ _rot_x(r)
mat[:3, 3] = xyz
return mat
def _batch_axis_rot(axis: tuple, angles: np.ndarray) -> np.ndarray:
n = len(angles)
ax = np.asarray(axis, dtype=np.float64)
ax = ax / np.linalg.norm(ax)
x, y, z = ax
c = np.cos(angles)
s = np.sin(angles)
t = 1 - c
rot = np.zeros((n, 4, 4))
rot[:, 0, 0] = t * x * x + c
rot[:, 0, 1] = t * x * y - s * z
rot[:, 0, 2] = t * x * z + s * y
rot[:, 1, 0] = t * x * y + s * z
rot[:, 1, 1] = t * y * y + c
rot[:, 1, 2] = t * y * z - s * x
rot[:, 2, 0] = t * x * z - s * y
rot[:, 2, 1] = t * y * z + s * x
rot[:, 2, 2] = t * z * z + c
rot[:, 3, 3] = 1.0
return rot
def batch_fk(chain: list, joint_angles: np.ndarray) -> np.ndarray:
n = joint_angles.shape[0]
tf_batch = np.tile(np.eye(4), (n, 1, 1))
qi = 0
for rpy, xyz, axis in chain:
tf_batch = tf_batch @ _tf(rpy, xyz)
if axis is not None:
rot = _batch_axis_rot(axis, joint_angles[:, qi])
tf_batch = np.einsum("nij,njk->nik", tf_batch, rot)
qi += 1
return tf_batch[:, :3, 3]
# ── Data helpers ────────────────────────────────────────
def _flatten_names(obj: object) -> list[str]:
if isinstance(obj, dict):
out: list[str] = []
for v in obj.values():
out.extend(_flatten_names(v))
return out
if isinstance(obj, (list, tuple)):
out = []
for item in obj:
if isinstance(item, (list, tuple, dict)):
out.extend(_flatten_names(item))
else:
out.append(str(item))
return out
return [str(obj)]
def _detect_and_convert(vals: np.ndarray) -> np.ndarray:
mx = np.max(np.abs(vals))
if mx > 360:
print(f" Unit detection: servo ticks (max={mx:.0f})")
return (vals - 2048) / 2048 * np.pi
if mx > 6.3:
print(f" Unit detection: degrees (max={mx:.1f})")
return np.deg2rad(vals)
print(f" Unit detection: radians (max={mx:.3f})")
return vals.astype(np.float64)
def _find_joint_indices(features: dict, state_col: str, n_dim: int) -> tuple[list[int], list[int]]:
feat = features.get("observation.state", features.get(state_col, {}))
names = _flatten_names(feat.get("names", []))
left_idx: list[int] = []
right_idx: list[int] = []
if names and len(names) == n_dim:
names_l = [n.lower() for n in names]
print(f" Feature names: {names[:4]}{names[-4:]}")
for j in range(1, 8):
for i, nm in enumerate(names_l):
if f"left_joint_{j}" in nm and i not in left_idx:
left_idx.append(i)
break
for i, nm in enumerate(names_l):
if f"right_joint_{j}" in nm and i not in right_idx:
right_idx.append(i)
break
if len(left_idx) == 7 and len(right_idx) == 7:
print(f" Matched by name: left={left_idx} right={right_idx}")
return left_idx, right_idx
if n_dim >= 16:
print(" Falling back to positional: [0:7]=left, [8:15]=right")
return list(range(7)), list(range(8, 15))
if n_dim >= 14:
print(" Falling back to positional: [0:7]=left, [7:14]=right")
return list(range(7)), list(range(7, 14))
raise RuntimeError(f"State dim {n_dim} too small for bimanual 7-DOF robot")
def download_data(repo_id: str, camera_key: str) -> Path:
print(f" Downloading {repo_id} (parquet + {camera_key} videos) …")
return Path(
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
allow_patterns=[
"meta/**",
"data/**",
f"videos/{camera_key}/**",
],
)
)
# ── Data loading ────────────────────────────────────────
def _build_action_chunks(
actions: np.ndarray, episode_ids: np.ndarray, chunk_size: int
) -> tuple[np.ndarray, np.ndarray]:
"""
For each frame, concatenate the next chunk_size actions from the same episode.
Returns (action_chunks, valid_mask).
"""
n = len(actions)
act_dim = actions.shape[1]
chunks = np.zeros((n, chunk_size * act_dim), dtype=np.float64)
valid = np.zeros(n, dtype=bool)
for i in range(n):
end = i + chunk_size
if end > n:
continue
if episode_ids[i] != episode_ids[end - 1]:
continue
chunks[i] = actions[i:end].ravel()
valid[i] = True
return chunks, valid
def load_state_action_data(local: Path, max_frames: int, chunk_size: int, rng: np.random.Generator) -> dict:
"""
Load observation.state and action, build action chunks, subsample, normalize.
Also returns the original row indices (`chosen_idx`) for video frame mapping.
"""
info = json.loads((local / "meta" / "info.json").read_text())
features = info.get("features", {})
dfs = [pd.read_parquet(pq) for pq in sorted((local / "data").glob("**/*.parquet"))]
df = pd.concat(dfs, ignore_index=True)
n_total = len(df)
print(f" Total frames: {n_total:,}")
state_col = next((c for c in df.columns if "observation.state" in c), None)
action_col = next((c for c in df.columns if c == "action"), None)
if state_col is None:
raise RuntimeError(f"No observation.state column. Available: {list(df.columns)}")
if action_col is None:
raise RuntimeError(f"No action column. Available: {list(df.columns)}")
ep_col = next((c for c in df.columns if c == "episode_index"), None)
if ep_col is None:
raise RuntimeError(f"No episode_index column. Available: {list(df.columns)}")
state_all = np.stack(df[state_col].values).astype(np.float64)
action_all = np.stack(df[action_col].values).astype(np.float64)
episode_all = df[ep_col].values.astype(np.int64)
n_dim = state_all.shape[1]
act_dim = action_all.shape[1]
print(f" State dim: {n_dim} Action dim: {act_dim} Chunk size: {chunk_size}")
print(f" Action chunk dim: {chunk_size * act_dim}")
left_idx, right_idx = _find_joint_indices(features, state_col, n_dim)
print(" Building action chunks …")
action_chunks, valid = _build_action_chunks(action_all, episode_all, chunk_size)
valid_idx = np.where(valid)[0]
print(f" Valid frames (with full action chunk): {len(valid_idx):,} / {n_total:,}")
if len(valid_idx) > max_frames:
chosen = np.sort(rng.choice(valid_idx, max_frames, replace=False))
else:
chosen = valid_idx
print(f" Using {len(chosen):,} frames")
state_raw = state_all[chosen]
action_raw = action_chunks[chosen]
episode_ids = episode_all[chosen]
state_mean = state_raw.mean(axis=0)
state_std = state_raw.std(axis=0)
state_std[state_std < 1e-8] = 1.0
state_norm = (state_raw - state_mean) / state_std
action_mean = action_raw.mean(axis=0)
action_std = action_raw.std(axis=0)
action_std[action_std < 1e-8] = 1.0
action_norm = (action_raw - action_mean) / action_std
return {
"state_raw": state_raw,
"state_norm": state_norm,
"action_raw": action_raw,
"action_norm": action_norm,
"episode_ids": episode_ids,
"episode_all": episode_all,
"left_joint_idx": left_idx,
"right_joint_idx": right_idx,
"n_total": n_total,
"chosen_idx": chosen,
"df": df,
}
# ── Video → frame extraction ──────────────────────────────
def build_video_lookup(local: Path, camera_key: str) -> dict:
"""
Build a mapping from episode_index → {video_path, fps, from_ts}.
"""
info = json.loads((local / "meta" / "info.json").read_text())
fps = info["fps"]
video_template = info.get(
"video_path",
"videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4",
)
ep_rows = []
for pq in sorted((local / "meta" / "episodes").glob("**/*.parquet")):
ep_rows.append(pd.read_parquet(pq))
ep_df = pd.concat(ep_rows, ignore_index=True)
chunk_col = f"videos/{camera_key}/chunk_index"
file_col = f"videos/{camera_key}/file_index"
ts_from = f"videos/{camera_key}/from_timestamp"
if chunk_col not in ep_df.columns:
chunk_col = f"{camera_key}/chunk_index"
file_col = f"{camera_key}/file_index"
ts_from = f"{camera_key}/from_timestamp"
lookup: dict[int, dict] = {}
for _, row in ep_df.iterrows():
ci = int(row[chunk_col])
fi = int(row[file_col])
video_rel = video_template.format(video_key=camera_key, chunk_index=ci, file_index=fi)
lookup[int(row["episode_index"])] = {
"video_path": local / video_rel,
"from_ts": float(row[ts_from]),
"fps": fps,
}
return lookup
def _decode_video_frames(video_path: str) -> list[np.ndarray]:
"""Decode all frames from a video file using PyAV. Returns list of RGB arrays."""
container = av.open(video_path)
stream = container.streams.video[0]
stream.thread_type = "AUTO"
decoded = []
for frame in container.decode(stream):
decoded.append(frame.to_ndarray(format="rgb24"))
container.close()
return decoded
def extract_frames(
chosen_idx: np.ndarray,
episode_all: np.ndarray,
video_lookup: dict,
) -> list[np.ndarray | None]:
"""
Extract RGB frames for each chosen global index using PyAV.
Returns list of (H, W, 3) RGB arrays (or None on failure).
"""
unique_eps = np.unique(episode_all)
ep_start: dict[int, int] = {}
for ep in unique_eps:
ep_start[int(ep)] = int(np.where(episode_all == ep)[0][0])
# Build jobs: (output_index, video_path, local_frame_number)
jobs: list[tuple[int, str, int]] = []
for out_i, global_i in enumerate(chosen_idx):
ep = int(episode_all[global_i])
info = video_lookup.get(ep)
if info is None:
continue
local_frame = global_i - ep_start[ep]
jobs.append((out_i, str(info["video_path"]), local_frame))
# Group by video file, decode each video once
from collections import defaultdict
video_jobs: dict[str, list[tuple[int, int]]] = defaultdict(list)
for out_i, vpath, local_frame in jobs:
video_jobs[vpath].append((out_i, local_frame))
frames: list[np.ndarray | None] = [None] * len(chosen_idx)
extracted = 0
n_videos = len(video_jobs)
for vi, (vpath, frame_requests) in enumerate(video_jobs.items()):
if not Path(vpath).exists():
continue
try:
decoded = _decode_video_frames(vpath)
except Exception as exc:
print(f" Warning: failed to decode {Path(vpath).name}: {exc}")
continue
for out_i, local_frame in frame_requests:
if 0 <= local_frame < len(decoded):
frames[out_i] = decoded[local_frame]
extracted += 1
if (vi + 1) % 50 == 0 or (vi + 1) == n_videos:
print(f" Decoded {vi + 1}/{n_videos} videos ({extracted:,} frames so far)")
del decoded
print(f" Extracted {extracted:,} / {len(chosen_idx):,} frames from video")
return frames
# ── SigLIP encoding ─────────────────────────────────────
def encode_frames_siglip(
frames: list[np.ndarray | None],
model_name: str,
batch_size: int,
device: torch.device,
) -> np.ndarray:
"""
Encode RGB frames through SigLIP vision encoder.
Returns (N, embed_dim) float32 array. Frames that are None get a zero vector.
"""
print(f" Loading SigLIP model: {model_name}")
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device).eval()
embed_dim = model.config.vision_config.hidden_size
n = len(frames)
embeddings = np.zeros((n, embed_dim), dtype=np.float32)
valid_indices = [i for i, f in enumerate(frames) if f is not None]
print(f" Encoding {len(valid_indices):,} valid frames in batches of {batch_size}")
for batch_start in range(0, len(valid_indices), batch_size):
batch_idx = valid_indices[batch_start : batch_start + batch_size]
pil_images = [Image.fromarray(frames[i]) for i in batch_idx]
inputs = processor(images=pil_images, return_tensors="pt").to(device)
with torch.no_grad():
image_features = model.get_image_features(**inputs)
image_features = torch.nn.functional.normalize(image_features, dim=-1)
embeddings[batch_idx] = image_features.cpu().numpy()
done = min(batch_start + batch_size, len(valid_indices))
if done % (batch_size * 10) == 0 or done == len(valid_indices):
print(f" {done:,} / {len(valid_indices):,} encoded")
del model, processor
torch.cuda.empty_cache()
return embeddings
# ── KNN consistency ─────────────────────────────────────
def compute_consistency(
features: np.ndarray,
action_norm: np.ndarray,
episode_ids: np.ndarray,
k: int,
label: str = "",
) -> np.ndarray:
"""
For each frame, find K nearest neighbors in feature space from other episodes.
Return per-frame action variance (mean across action dims).
"""
n = len(features)
print(f" Building KD-tree on {n:,} vectors ({label}) …")
tree = cKDTree(features)
k_query = min(k * 3, n - 1)
print(f" Querying {k_query} neighbors per frame …")
_dists, indices = tree.query(features, k=k_query + 1)
indices = indices[:, 1:]
print(f" Computing cross-episode action variance ({label}) …")
variance = np.zeros(n)
for i in range(n):
ep_i = episode_ids[i]
neighbors = indices[i]
cross_ep = neighbors[episode_ids[neighbors] != ep_i][:k]
if len(cross_ep) < 2:
variance[i] = 0.0
continue
neighbor_actions = action_norm[cross_ep]
variance[i] = np.mean(np.var(neighbor_actions, axis=0))
return variance
# ── Visualization ───────────────────────────────────────
def _style_ax(ax: plt.Axes) -> None:
ax.set_facecolor("#0d1117")
ax.tick_params(colors="#555", labelsize=8)
for spine in ax.spines.values():
spine.set_color("#333")
def _plot_histogram(ax: plt.Axes, variance: np.ndarray, title: str, color: str) -> None:
_style_ax(ax)
median_var = np.median(variance)
mean_var = np.mean(variance)
nonzero = variance[variance > 0]
if len(nonzero) > 0:
bins = np.logspace(np.log10(nonzero.min().clip(1e-6)), np.log10(nonzero.max()), 60)
ax.hist(nonzero, bins=bins, color=color, alpha=0.8, edgecolor="#222")
ax.set_xscale("log")
ax.axvline(median_var, color="#ff6600", linewidth=2, label=f"median={median_var:.3f}")
ax.axvline(mean_var, color="#ff2222", linewidth=2, linestyle="--", label=f"mean={mean_var:.3f}")
ax.set_xlabel("Action variance (log scale)", color="#888", fontsize=10)
ax.set_ylabel("Frame count", color="#888", fontsize=10)
ax.set_title(title, color="white", fontsize=11, pad=10)
ax.legend(fontsize=8, facecolor="#1a1a2e", edgecolor="#333", labelcolor="white")
def _plot_episode_curves(
ax: plt.Axes,
var_state: np.ndarray,
var_image: np.ndarray,
episode_ids: np.ndarray,
title: str,
) -> None:
_style_ax(ax)
unique_eps = np.unique(episode_ids)
ep_means_s = np.array([var_state[episode_ids == ep].mean() for ep in unique_eps])
ep_means_i = np.array([var_image[episode_ids == ep].mean() for ep in unique_eps])
sorted_s = np.sort(ep_means_s)[::-1]
sorted_i = np.sort(ep_means_i)[::-1]
ep_x = np.arange(len(unique_eps))
ax.fill_between(ep_x, sorted_s, alpha=0.2, color="#4363d8")
ax.plot(ep_x, sorted_s, color="#4363d8", linewidth=1.2, label=f"State (med={np.median(ep_means_s):.3f})")
ax.fill_between(ep_x, sorted_i, alpha=0.2, color="#e6194b")
ax.plot(ep_x, sorted_i, color="#e6194b", linewidth=1.2, label=f"Image (med={np.median(ep_means_i):.3f})")
ax.set_xlabel("Episode rank (worst → best)", color="#888", fontsize=10)
ax.set_ylabel("Mean action variance", color="#888", fontsize=10)
ax.set_title(title, color="white", fontsize=11, pad=10)
ax.legend(fontsize=8, facecolor="#1a1a2e", edgecolor="#333", labelcolor="white")
def _plot_heatmap(
ax: plt.Axes, fig: plt.Figure, tcp_xz: np.ndarray, variance: np.ndarray, title: str
) -> None:
_style_ax(ax)
order = np.argsort(variance)
pts = tcp_xz[order]
var_sorted = variance[order]
vmin = np.percentile(variance[variance > 0], 5) if np.any(variance > 0) else 0
vmax = np.percentile(variance[variance > 0], 95) if np.any(variance > 0) else 1
sc = ax.scatter(
pts[:, 0],
pts[:, 1],
c=var_sorted,
cmap=CONSISTENCY_CMAP,
s=0.5,
alpha=0.6,
vmin=vmin,
vmax=vmax,
rasterized=True,
)
ax.set_xlabel("X (m)", color="#888", fontsize=10)
ax.set_ylabel("Z (m)", color="#888", fontsize=10)
ax.set_title(title, color="white", fontsize=11, pad=10)
ax.set_aspect("equal")
cbar = fig.colorbar(sc, ax=ax, shrink=0.8, pad=0.02)
cbar.set_label("Action variance", color="white", fontsize=9)
cbar.ax.tick_params(colors="#aaa", labelsize=7)
def render(results: list[dict], out_path: Path) -> None:
"""
4-row x N-column figure:
Row 0: State-based variance histogram
Row 1: Image-based variance histogram
Row 2: Per-episode curves (both overlaid)
Row 3: Spatial heatmap (image-based variance)
"""
n_ds = len(results)
fig, axes = plt.subplots(4, n_ds, figsize=(9 * n_ds, 24), facecolor="#0d1117")
if n_ds == 1:
axes = axes[:, np.newaxis]
headline_parts = []
for col, r in enumerate(results):
label = r["label"]
var_s = r["var_state"]
var_i = r["var_image"]
tcp_xz = r["tcp_xz"]
episode_ids = r["episode_ids"]
med_s = np.median(var_s)
med_i = np.median(var_i)
headline_parts.append(f"{label}: state={med_s:.3f}, image={med_i:.3f}")
_plot_histogram(axes[0, col], var_s, f"{label}\nState-based variance (K={K_NEIGHBORS})", "#4363d8")
_plot_histogram(
axes[1, col], var_i, f"{label}\nImage-based variance (SigLIP, K={K_NEIGHBORS})", "#e6194b"
)
_plot_episode_curves(
axes[2, col],
var_s,
var_i,
episode_ids,
f"{label}\nPer-episode inconsistency ({len(np.unique(episode_ids)):,} episodes)",
)
_plot_heatmap(
axes[3, col],
fig,
tcp_xz,
var_i,
f"{label}\nImage-based variance by TCP position (XZ)",
)
fig.suptitle(
f"Action Consistency: State vs Image (chunk={ACTION_CHUNK_SIZE}, K={K_NEIGHBORS})\n"
+ " | ".join(headline_parts),
color="white",
fontsize=15,
y=0.99,
)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig(out_path, dpi=DPI, bbox_inches="tight", facecolor=fig.get_facecolor())
plt.close()
print(f"\n✓ Saved: {out_path}")
# ── Main ────────────────────────────────────────────────
def main() -> None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
rng = np.random.default_rng(SEED)
results = []
for ds in DATASETS:
repo_id, label = ds["repo_id"], ds["label"]
print(f"\n{'=' * 60}")
print(f" {label}: {repo_id}")
print(f"{'=' * 60}")
local = download_data(repo_id, CAMERA_KEY)
data = load_state_action_data(local, MAX_FRAMES, ACTION_CHUNK_SIZE, rng)
# --- State-based KNN ---
var_state = compute_consistency(
data["state_norm"], data["action_norm"], data["episode_ids"], K_NEIGHBORS, "state"
)
print(
f" State variance: median={np.median(var_state):.4f} "
f"mean={np.mean(var_state):.4f} p90={np.percentile(var_state, 90):.4f}"
)
# --- Image-based KNN ---
print("\n Preparing image embeddings …")
video_lookup = build_video_lookup(local, CAMERA_KEY)
frames = extract_frames(data["chosen_idx"], data["episode_all"], video_lookup)
embeddings = encode_frames_siglip(frames, ENCODER_MODEL, ENCODE_BATCH_SIZE, device)
del frames # free memory
var_image = compute_consistency(
embeddings, data["action_norm"], data["episode_ids"], K_NEIGHBORS, "image"
)
print(
f" Image variance: median={np.median(var_image):.4f} "
f"mean={np.mean(var_image):.4f} p90={np.percentile(var_image, 90):.4f}"
)
# FK for spatial heatmap
print(" Computing FK for spatial heatmap …")
left_raw = data["state_raw"][:, data["left_joint_idx"]]
left_rad = _detect_and_convert(left_raw)
left_tcp = batch_fk(LEFT_CHAIN, left_rad)
tcp_xz = left_tcp[:, [0, 2]]
results.append(
{
"label": label,
"var_state": var_state,
"var_image": var_image,
"episode_ids": data["episode_ids"],
"tcp_xz": tcp_xz,
"n_total": data["n_total"],
}
)
out = OUTPUT_DIR / "action_consistency_comparison.jpg"
render(results, out)
# Save worst-episodes summary (image-based, since that's the stronger signal)
worst_summary = {}
for r in results:
unique_eps = np.unique(r["episode_ids"])
ep_means = {int(ep): float(r["var_image"][r["episode_ids"] == ep].mean()) for ep in unique_eps}
ranked = sorted(ep_means.items(), key=lambda x: x[1], reverse=True)[:50]
worst_summary[r["label"]] = [{"episode": ep, "mean_variance": v} for ep, v in ranked]
worst_path = OUTPUT_DIR / "action_consistency_worst_episodes.json"
worst_path.write_text(json.dumps(worst_summary, indent=2))
print(f"✓ Saved worst episodes: {worst_path}")
if __name__ == "__main__":
main()
@@ -1,178 +0,0 @@
"""
Create a JPG grid of random frames sampled from a LeRobot video dataset.
Downloads metadata + video chunks from HuggingFace, picks random frames,
decodes them, and tiles into a single image.
"""
import json
import random
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
from huggingface_hub import snapshot_download
REPO_ID = "lerobot-data-collection/level2_final_quality3"
CAMERA_KEY = "observation.images.base"
GRID_COLS = 15
GRID_ROWS = 10
THUMB_WIDTH = 160
OUTPUT_DIR = Path(__file__).resolve().parent / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)
SEED = 1
def download_metadata(repo_id: str) -> Path:
"""Download only metadata (no videos yet)."""
print(f"[1/3] Downloading metadata for {repo_id}")
return Path(
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
allow_patterns=["meta/**"],
ignore_patterns=["*.mp4"],
)
)
def load_video_info(local: Path) -> tuple[str, list[dict], int]:
"""Parse info.json and episode parquets. Returns (camera_key, episode_rows, fps)."""
info = json.loads((local / "meta" / "info.json").read_text())
fps = info["fps"]
features = info["features"]
video_keys = [k for k, v in features.items() if v.get("dtype") == "video"]
if not video_keys:
raise RuntimeError("No video keys found in dataset features")
if CAMERA_KEY is not None:
if CAMERA_KEY not in video_keys:
raise RuntimeError(f"CAMERA_KEY='{CAMERA_KEY}' not found. Available: {video_keys}")
cam = CAMERA_KEY
else:
cam = video_keys[0]
print(f" camera='{cam}' all_cams={video_keys} fps={fps}")
ep_rows = []
for pq in sorted((local / "meta" / "episodes").glob("**/*.parquet")):
ep_rows.append(pd.read_parquet(pq))
ep_df = pd.concat(ep_rows, ignore_index=True)
video_template = info.get(
"video_path",
"videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4",
)
chunk_col = f"videos/{cam}/chunk_index"
file_col = f"videos/{cam}/file_index"
ts_from = f"videos/{cam}/from_timestamp"
ts_to = f"videos/{cam}/to_timestamp"
if chunk_col not in ep_df.columns:
chunk_col = f"{cam}/chunk_index"
file_col = f"{cam}/file_index"
ts_from = f"{cam}/from_timestamp"
ts_to = f"{cam}/to_timestamp"
episodes = []
for _, row in ep_df.iterrows():
ci = int(row[chunk_col])
fi = int(row[file_col])
episodes.append(
{
"episode_index": int(row["episode_index"]),
"chunk_index": ci,
"file_index": fi,
"from_ts": float(row[ts_from]),
"to_ts": float(row[ts_to]),
"video_rel": video_template.format(video_key=cam, chunk_index=ci, file_index=fi),
}
)
return cam, episodes, fps
def pick_random_frames(episodes: list[dict], fps: int, n: int, rng: random.Random) -> list[dict]:
"""Pick n random (episode, timestamp) pairs, return sorted by video file for efficient access."""
picks = []
for _ in range(n):
ep = rng.choice(episodes)
duration = ep["to_ts"] - ep["from_ts"]
if duration <= 0:
continue
t = ep["from_ts"] + rng.random() * duration
picks.append({**ep, "seek_ts": t})
picks.sort(key=lambda p: (p["video_rel"], p["seek_ts"]))
return picks
def download_video_files(repo_id: str, local: Path, picks: list[dict]) -> None:
"""Download only the video files we need."""
needed = sorted({p["video_rel"] for p in picks})
print(f"[2/3] Downloading {len(needed)} video file(s) …")
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
local_dir=str(local),
allow_patterns=needed,
)
def extract_frame(video_path: Path, seek_ts: float) -> np.ndarray | None:
"""Decode a single frame at the given timestamp."""
cap = cv2.VideoCapture(str(video_path))
cap.set(cv2.CAP_PROP_POS_MSEC, seek_ts * 1000.0)
ret, frame = cap.read()
cap.release()
return frame if ret else None
def build_grid(frames: list[np.ndarray], cols: int, thumb_w: int) -> np.ndarray:
"""Resize frames to uniform thumbnails and tile into a grid."""
if not frames:
raise RuntimeError("No frames decoded")
h0, w0 = frames[0].shape[:2]
thumb_h = int(thumb_w * h0 / w0)
thumbs = [cv2.resize(f, (thumb_w, thumb_h), interpolation=cv2.INTER_AREA) for f in frames]
rows = []
for i in range(0, len(thumbs), cols):
row_thumbs = thumbs[i : i + cols]
while len(row_thumbs) < cols:
row_thumbs.append(np.zeros_like(row_thumbs[0]))
rows.append(np.hstack(row_thumbs))
return np.vstack(rows)
def main() -> None:
rng = random.Random(SEED)
n_frames = GRID_COLS * GRID_ROWS
local = download_metadata(REPO_ID)
cam, episodes, fps = load_video_info(local)
picks = pick_random_frames(episodes, fps, n_frames, rng)
download_video_files(REPO_ID, local, picks)
print(f"[3/3] Decoding {n_frames} frames …")
frames: list[np.ndarray] = []
for p in picks:
vp = local / p["video_rel"]
if not vp.exists():
print(f" SKIP: {p['video_rel']} not found")
continue
frame = extract_frame(vp, p["seek_ts"])
if frame is not None:
frames.append(frame)
print(f" Decoded {len(frames)}/{n_frames} frames")
grid = build_grid(frames, GRID_COLS, THUMB_WIDTH)
safe_name = REPO_ID.replace("/", "_")
out_path = OUTPUT_DIR / f"{safe_name}_grid_{GRID_COLS}x{GRID_ROWS}.jpg"
cv2.imwrite(str(out_path), grid, [cv2.IMWRITE_JPEG_QUALITY, 92])
print(f"\n✓ Saved: {out_path} ({grid.shape[1]}×{grid.shape[0]})")
if __name__ == "__main__":
main()
@@ -1,526 +0,0 @@
"""
Create MP4 videos with sarm_progress overlay for specified episodes.
Downloads datasets from HuggingFace, extracts episode video + progress data,
and draws the progress line directly on each frame (no panel, no axes).
"""
import json
import subprocess
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
from huggingface_hub import snapshot_download
DATASETS = [
{"repo_id": "lerobot-data-collection/level2_final_quality3", "episode": 250},
]
CAMERA_KEY = (
"observation.images.base" # None = auto-select first camera, or set e.g. "observation.images.top"
)
OUTPUT_DIR = Path(__file__).resolve().parent / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)
# Progress line spans the full video height
GRAPH_Y_TOP_FRAC = 0.01
GRAPH_Y_BOT_FRAC = 0.99
LINE_THICKNESS = 3
SHADOW_THICKNESS = 6 # white edge thickness
REF_ALPHA = 0.45 # opacity of the 1.0 reference line
FILL_ALPHA = 0.55 # opacity of the grey fill under the line
SCORE_FONT_SCALE = 0.8
TASK_FONT_SCALE = 0.55
def download_episode(repo_id: str, episode: int) -> Path:
"""Download only the files needed for this episode."""
# We need: meta/, sarm_progress.parquet, and the relevant video/data chunks.
# We'll download meta + sarm first, then figure out chunks.
print(f"\n[1/5] Downloading metadata for {repo_id}")
local = Path(
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
allow_patterns=["meta/**", "sarm_progress.parquet"],
ignore_patterns=["*.mp4"],
)
)
return local
def load_episode_meta(local: Path, episode: int) -> dict:
"""Read info.json + episode-level parquet to get fps, video paths, timestamps."""
info = json.loads((local / "meta" / "info.json").read_text())
fps = info["fps"]
features = info["features"]
# Find video keys (keys whose dtype=="video")
video_keys = [k for k, v in features.items() if v.get("dtype") == "video"]
if not video_keys:
raise RuntimeError("No video keys found in dataset features")
if CAMERA_KEY is not None:
if CAMERA_KEY not in video_keys:
raise RuntimeError(f"CAMERA_KEY='{CAMERA_KEY}' not found. Available: {video_keys}")
first_cam = CAMERA_KEY
else:
first_cam = video_keys[0]
print(f" fps={fps} camera='{first_cam}' all_cams={video_keys}")
# Load all episode-meta parquet files and find our episode
ep_rows = []
for pq in sorted((local / "meta" / "episodes").glob("**/*.parquet")):
df = pd.read_parquet(pq)
ep_rows.append(df)
ep_df = pd.concat(ep_rows, ignore_index=True)
row = ep_df[ep_df["episode_index"] == episode]
if row.empty:
raise RuntimeError(f"Episode {episode} not found in episode metadata")
row = row.iloc[0]
# Extract video chunk/file index for first camera
# Try both dot and slash variants of the key
chunk_col = f"videos/{first_cam}/chunk_index"
file_col = f"videos/{first_cam}/file_index"
ts_col = f"videos/{first_cam}/from_timestamp"
to_col = f"videos/{first_cam}/to_timestamp"
# Some datasets use different column naming
if chunk_col not in row.index:
# Try without the 'videos/' prefix
chunk_col = f"{first_cam}/chunk_index"
file_col = f"{first_cam}/file_index"
ts_col = f"{first_cam}/from_timestamp"
to_col = f"{first_cam}/to_timestamp"
if chunk_col not in row.index:
raise RuntimeError(
f"Cannot find video metadata columns for {first_cam}.\nAvailable: {list(row.index)}"
)
chunk_idx = int(row[chunk_col])
file_idx = int(row[file_col])
from_ts = float(row[ts_col])
to_ts = float(row[to_col])
video_template = info.get(
"video_path", "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4"
)
video_rel = video_template.format(
video_key=first_cam,
chunk_index=chunk_idx,
file_index=file_idx,
)
# Load task name for this episode
# tasks.parquet uses the task string as the row index; task_index column holds the int id
task_name = ""
try:
# Prefer the 'tasks' list directly on the episode row
if "tasks" in row.index and row["tasks"] is not None:
tasks_val = row["tasks"]
if isinstance(tasks_val, (list, tuple, np.ndarray)) and len(tasks_val) > 0:
task_name = str(tasks_val[0])
else:
task_name = str(tasks_val).strip("[]'")
else:
tasks_pq = local / "meta" / "tasks.parquet"
if tasks_pq.exists():
tasks_df = pd.read_parquet(tasks_pq)
# Row index is the task string; task_index column is the int
task_idx = int(row.get("task_index", 0)) if "task_index" in row.index else 0
match = tasks_df[tasks_df["task_index"] == task_idx]
if not match.empty:
task_name = str(match.index[0])
print(f" Task name: '{task_name}'")
except Exception as e:
print(f" WARNING: could not load task name: {e}")
return {
"fps": fps,
"first_cam": first_cam,
"video_rel": video_rel,
"chunk_index": chunk_idx,
"file_index": file_idx,
"from_ts": from_ts,
"to_ts": to_ts,
"task_name": task_name,
}
def download_video(repo_id: str, local: Path, video_rel: str) -> Path:
"""Download the specific video file if not already present."""
video_path = local / video_rel
if video_path.exists():
print(f" Video already cached: {video_path}")
return video_path
print(f"[2/5] Downloading video file {video_rel}")
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
local_dir=str(local),
allow_patterns=[video_rel],
)
if not video_path.exists():
raise RuntimeError(f"Video not found after download: {video_path}")
return video_path
def load_progress(local: Path, episode: int) -> np.ndarray | None:
"""Load sarm_progress values for this episode. Returns sorted array of (frame_index, progress)."""
pq_path = local / "sarm_progress.parquet"
if not pq_path.exists():
print(" WARNING: sarm_progress.parquet not found, trying data parquet …")
return None
df = pd.read_parquet(pq_path)
print(f" sarm_progress.parquet columns: {list(df.columns)}")
ep_df = df[df["episode_index"] == episode].copy()
if ep_df.empty:
print(f" WARNING: No sarm_progress rows for episode {episode}")
return None
ep_df = ep_df.sort_values("frame_index")
# Prefer dense, fall back to sparse
if "progress_dense" in ep_df.columns and ep_df["progress_dense"].notna().any():
prog_col = "progress_dense"
elif "progress_sparse" in ep_df.columns:
prog_col = "progress_sparse"
else:
# Last resort: any column with 'progress' in the name
prog_cols = [c for c in ep_df.columns if "progress" in c.lower()]
if not prog_cols:
return None
prog_col = prog_cols[0]
print(f" Using progress column: '{prog_col}'")
return ep_df[["frame_index", prog_col]].rename(columns={prog_col: "progress"}).values
def extract_episode_clip(video_path: Path, from_ts: float, to_ts: float, out_path: Path) -> Path:
"""Use ffmpeg to cut the episode segment from the combined video file."""
duration = to_ts - from_ts
print(f"[3/5] Extracting clip [{from_ts:.3f}s → {to_ts:.3f}s] ({duration:.2f}s) …")
cmd = [
"ffmpeg",
"-y",
"-ss",
str(from_ts),
"-i",
str(video_path),
"-t",
str(duration),
"-c:v",
"libx264",
"-preset",
"fast",
"-crf",
"18",
"-an",
str(out_path),
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"ffmpeg clip extraction failed:\n{result.stderr}")
return out_path
def precompute_pixels(
progress_data: np.ndarray,
n_frames: int,
frame_w: int,
frame_h: int,
) -> np.ndarray:
"""
Map each progress sample to pixel coordinates.
Returns array of shape (N, 2) with (x, y) in pixel space.
x spans full video width; y maps progress [0,1] to graph band.
"""
frame_indices = progress_data[:, 0].astype(float)
progress_vals = np.clip(progress_data[:, 1].astype(float), 0.0, 1.0)
y_top = int(frame_h * GRAPH_Y_TOP_FRAC)
y_bot = int(frame_h * GRAPH_Y_BOT_FRAC)
graph_h = y_bot - y_top
xs = (frame_indices / (n_frames - 1) * (frame_w - 1)).astype(int)
# progress=1 → y_top, progress=0 → y_bot
ys = (y_bot - progress_vals * graph_h).astype(int)
return np.stack([xs, ys], axis=1) # (N, 2)
def progress_color(t: float) -> tuple[int, int, int]:
"""Interpolate BGR color red→green based on normalised position t in [0,1]."""
r = int(255 * (1.0 - t))
g = int(255 * t)
return (0, g, r) # BGR
def prerender_fill(
pixels: np.ndarray,
frame_w: int,
frame_h: int,
) -> np.ndarray:
"""Pre-render the full grey fill polygon under the curve as a BGRA image."""
y_bot = int(frame_h * GRAPH_Y_BOT_FRAC)
fill_img = np.zeros((frame_h, frame_w, 4), dtype=np.uint8)
poly = np.concatenate(
[
pixels,
[[pixels[-1][0], y_bot], [pixels[0][0], y_bot]],
],
axis=0,
).astype(np.int32)
cv2.fillPoly(fill_img, [poly], color=(128, 128, 128, int(255 * FILL_ALPHA)))
return fill_img
def alpha_composite(base: np.ndarray, overlay_bgra: np.ndarray, x_max: int) -> None:
"""Blend overlay onto base in-place, but only for x < x_max."""
if x_max <= 0:
return
roi_b = base[:, :x_max]
roi_o = overlay_bgra[:, :x_max]
alpha = roi_o[:, :, 3:4].astype(np.float32) / 255.0
roi_b[:] = np.clip(
roi_o[:, :, :3].astype(np.float32) * alpha + roi_b.astype(np.float32) * (1.0 - alpha),
0,
255,
).astype(np.uint8)
def draw_text_outlined(
frame: np.ndarray,
text: str,
pos: tuple[int, int],
font_scale: float,
thickness: int = 1,
) -> None:
"""Draw text with a dark outline for readability on any background."""
font = cv2.FONT_HERSHEY_SIMPLEX
cv2.putText(frame, text, pos, font, font_scale, (0, 0, 0), thickness + 2, cv2.LINE_AA)
cv2.putText(frame, text, pos, font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)
def composite_video(
clip_path: Path,
progress_data: np.ndarray,
out_path: Path,
fps: float,
frame_h: int,
frame_w: int,
task_name: str = "",
) -> Path:
"""Read clip frames, draw gradient progress line with fill + labels, export as GIF."""
n_total = int(cv2.VideoCapture(str(clip_path)).get(cv2.CAP_PROP_FRAME_COUNT))
pixels = precompute_pixels(progress_data, n_total, frame_w, frame_h)
y_ref = int(frame_h * GRAPH_Y_TOP_FRAC)
# Pre-render fill polygon (line is drawn per-frame with live color)
fill_img = prerender_fill(pixels, frame_w, frame_h)
# 1.0 reference line overlay (full width, drawn once)
ref_img = np.zeros((frame_h, frame_w, 4), dtype=np.uint8)
cv2.line(ref_img, (0, y_ref), (frame_w - 1, y_ref), (200, 200, 200, int(255 * REF_ALPHA)), 1, cv2.LINE_AA)
frame_indices = progress_data[:, 0].astype(int)
progress_vals = progress_data[:, 1].astype(float)
print(f"[4/4] Compositing {n_total} frames …")
cap = cv2.VideoCapture(str(clip_path))
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
tmp_path = out_path.parent / (out_path.stem + "_tmp.mp4")
writer = cv2.VideoWriter(str(tmp_path), fourcc, fps, (frame_w, frame_h))
fi = 0
while True:
ret, frame = cap.read()
if not ret:
break
n_drawn = int(np.searchsorted(frame_indices, fi, side="right"))
x_cur = int(pixels[min(n_drawn, len(pixels)) - 1][0]) + 1 if n_drawn > 0 else 0
# 1. reference line (full width, always)
alpha_composite(frame, ref_img, frame_w)
# 2. grey fill under curve up to current x
alpha_composite(frame, fill_img, x_cur)
# 3. progress line — single color that transitions red→green over time
if n_drawn >= 2:
t_cur = (n_drawn - 1) / max(len(progress_vals) - 1, 1)
line_col = progress_color(t_cur)
pts = pixels[:n_drawn].reshape(-1, 1, 2).astype(np.int32)
cv2.polylines(
frame,
[pts],
isClosed=False,
color=(255, 255, 255),
thickness=SHADOW_THICKNESS,
lineType=cv2.LINE_AA,
)
cv2.polylines(
frame, [pts], isClosed=False, color=line_col, thickness=LINE_THICKNESS, lineType=cv2.LINE_AA
)
# 4. score — bottom right
if n_drawn > 0:
score = float(progress_vals[min(n_drawn, len(progress_vals)) - 1])
score_text = f"{score:.2f}"
(tw, th), _ = cv2.getTextSize(score_text, cv2.FONT_HERSHEY_SIMPLEX, SCORE_FONT_SCALE, 2)
sx = frame_w - tw - 12
sy = frame_h - 12
# coloured score matching current gradient position
t_cur = (n_drawn - 1) / max(len(progress_vals) - 1, 1)
score_col = progress_color(t_cur)
cv2.putText(
frame,
score_text,
(sx, sy),
cv2.FONT_HERSHEY_SIMPLEX,
SCORE_FONT_SCALE,
(0, 0, 0),
4,
cv2.LINE_AA,
)
cv2.putText(
frame,
score_text,
(sx, sy),
cv2.FONT_HERSHEY_SIMPLEX,
SCORE_FONT_SCALE,
score_col,
2,
cv2.LINE_AA,
)
# 5. task name — top centre
if task_name:
(tw, _), _ = cv2.getTextSize(task_name, cv2.FONT_HERSHEY_SIMPLEX, TASK_FONT_SCALE, 1)
tx = max((frame_w - tw) // 2, 4)
draw_text_outlined(frame, task_name, (tx, 22), TASK_FONT_SCALE)
writer.write(frame)
fi += 1
if fi % 100 == 0:
print(f" Frame {fi}/{n_total}", end="\r")
cap.release()
writer.release()
print()
# Convert to GIF: full resolution, 12fps, 128-color diff palette (<40MB)
gif_path = out_path.with_suffix(".gif")
palette = out_path.parent / "_palette.png"
r1 = subprocess.run( # nosec B607
[
"ffmpeg",
"-y",
"-i",
str(tmp_path),
"-vf",
f"fps=10,scale={frame_w}:-1:flags=lanczos,palettegen=max_colors=128:stats_mode=diff",
"-update",
"1",
str(palette),
],
capture_output=True,
text=True,
)
if r1.returncode != 0:
print(f" WARNING: palettegen failed:\n{r1.stderr[-500:]}")
r2 = subprocess.run( # nosec B607
[
"ffmpeg",
"-y",
"-i",
str(tmp_path),
"-i",
str(palette),
"-filter_complex",
f"fps=10,scale={frame_w}:-1:flags=lanczos[v];[v][1:v]paletteuse=dither=bayer:bayer_scale=3",
str(gif_path),
],
capture_output=True,
text=True,
)
if r2.returncode != 0:
print(f" WARNING: gif encode failed:\n{r2.stderr[-500:]}")
tmp_path.unlink(missing_ok=True)
palette.unlink(missing_ok=True)
return gif_path
def process_dataset(repo_id: str, episode: int):
safe_name = repo_id.replace("/", "_")
print(f"\n{'=' * 60}")
print(f"Processing: {repo_id} | episode {episode}")
print(f"{'=' * 60}")
# 1. Download metadata
local = download_episode(repo_id, episode)
print(f" Local cache: {local}")
# 2. Read episode metadata
ep_meta = load_episode_meta(local, episode)
print(f" Episode meta: {ep_meta}")
# 3. Download video file
video_path = download_video(repo_id, local, ep_meta["video_rel"])
# 4. Extract clip
clip_path = OUTPUT_DIR / f"{safe_name}_ep{episode}_clip.mp4"
extract_episode_clip(video_path, ep_meta["from_ts"], ep_meta["to_ts"], clip_path)
# 5. Load progress data
progress_data = load_progress(local, episode)
if progress_data is None:
print(" ERROR: Could not load sarm_progress data. Skipping overlay.")
return
n_progress = len(progress_data)
print(f" Progress frames: {n_progress}")
# 6. Get clip dimensions
cap = cv2.VideoCapture(str(clip_path))
frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
actual_fps = cap.get(cv2.CAP_PROP_FPS) or ep_meta["fps"]
cap.release()
print(f" Clip: {frame_w}×{frame_h} {n_frames} frames @ {actual_fps:.1f}fps")
# 7. Composite (draw line directly on frames)
out_path = OUTPUT_DIR / f"{safe_name}_ep{episode}_progress.mp4"
final = composite_video(
clip_path,
progress_data,
out_path,
actual_fps,
frame_h,
frame_w,
task_name=ep_meta.get("task_name", ""),
)
clip_path.unlink(missing_ok=True)
print(f"\n✓ Done: {final}")
return final
if __name__ == "__main__":
results = []
for cfg in DATASETS:
try:
out = process_dataset(cfg["repo_id"], cfg["episode"])
if out:
results.append(out)
except Exception as e:
print(f"\nERROR processing {cfg['repo_id']}: {e}")
import traceback
traceback.print_exc()
print("\n" + "=" * 60)
print("Output files:")
for r in results:
print(f" {r}")
@@ -1,496 +0,0 @@
"""
Visualize end-effector workspace density and trajectory clusters for OpenArm datasets.
Downloads joint position data (no videos) from HuggingFace, computes forward
kinematics per episode, clusters trajectories with K-means, and renders
2D projections comparing dataset coverage and multimodality.
"""
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from huggingface_hub import snapshot_download
from sklearn.cluster import KMeans
DATASETS = [
{"repo_id": "lerobot-data-collection/level2_final_quality3", "label": "HQ curated"},
{"repo_id": "lerobot-data-collection/level12_rac_2_2026-02-08_1", "label": "Full collection"},
]
OUTPUT_DIR = Path(__file__).resolve().parent / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)
N_CLUSTERS = 10
WAYPOINTS = 50
SEED = 42
DPI = 180
CLUSTER_COLORS = [
"#e6194b",
"#3cb44b",
"#4363d8",
"#f58231",
"#911eb4",
"#42d4f4",
"#f032e6",
"#bfef45",
"#fabed4",
"#dcbeff",
"#9a6324",
"#fffac8",
"#800000",
"#aaffc3",
"#808000",
"#ffd8b1",
"#000075",
"#a9a9a9",
]
# FK chains extracted from OpenArm bimanual URDF.
# Each entry: (rpy, xyz, revolute_axis_or_None).
LEFT_CHAIN = [
((-np.pi / 2, 0, 0), (0, 0.031, 0.698), None),
((0, 0, 0), (0, 0, 0.0625), (0, 0, 1)),
((-np.pi / 2, 0, 0), (-0.0301, 0, 0.06), (-1, 0, 0)),
((0, 0, 0), (0.0301, 0, 0.06625), (0, 0, 1)),
((0, 0, 0), (0, 0.0315, 0.15375), (0, 1, 0)),
((0, 0, 0), (0, -0.0315, 0.0955), (0, 0, 1)),
((0, 0, 0), (0.0375, 0, 0.1205), (1, 0, 0)),
((0, 0, 0), (-0.0375, 0, 0), (0, -1, 0)),
((0, 0, 0), (0, 0, 0.1001), None),
((0, 0, 0), (0, 0, 0.08), None),
]
RIGHT_CHAIN = [
((np.pi / 2, 0, 0), (0, -0.031, 0.698), None),
((0, 0, 0), (0, 0, 0.0625), (0, 0, 1)),
((np.pi / 2, 0, 0), (-0.0301, 0, 0.06), (-1, 0, 0)),
((0, 0, 0), (0.0301, 0, 0.06625), (0, 0, 1)),
((0, 0, 0), (0, 0.0315, 0.15375), (0, 1, 0)),
((0, 0, 0), (0, -0.0315, 0.0955), (0, 0, 1)),
((0, 0, 0), (0.0375, 0, 0.1205), (1, 0, 0)),
((0, 0, 0), (-0.0375, 0, 0), (0, 1, 0)),
((0, 0, 0), (0, 0, 0.1001), None),
((0, 0, 0), (0, 0, 0.08), None),
]
# ── FK math ─────────────────────────────────────────────
def _rot_x(a: float) -> np.ndarray:
c, s = np.cos(a), np.sin(a)
return np.array([[1, 0, 0], [0, c, -s], [0, s, c]])
def _rot_y(a: float) -> np.ndarray:
c, s = np.cos(a), np.sin(a)
return np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]])
def _rot_z(a: float) -> np.ndarray:
c, s = np.cos(a), np.sin(a)
return np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
def _tf(rpy: tuple, xyz: tuple) -> np.ndarray:
"""Build a 4x4 homogeneous transform from URDF rpy + xyz."""
r, p, y = rpy
mat = np.eye(4)
mat[:3, :3] = _rot_z(y) @ _rot_y(p) @ _rot_x(r)
mat[:3, 3] = xyz
return mat
def _batch_axis_rot(axis: tuple, angles: np.ndarray) -> np.ndarray:
"""Batched Rodrigues rotation: (n,) angles around a fixed axis → (n, 4, 4)."""
n = len(angles)
ax = np.asarray(axis, dtype=np.float64)
ax = ax / np.linalg.norm(ax)
x, y, z = ax
c = np.cos(angles)
s = np.sin(angles)
t = 1 - c
rot = np.zeros((n, 4, 4))
rot[:, 0, 0] = t * x * x + c
rot[:, 0, 1] = t * x * y - s * z
rot[:, 0, 2] = t * x * z + s * y
rot[:, 1, 0] = t * x * y + s * z
rot[:, 1, 1] = t * y * y + c
rot[:, 1, 2] = t * y * z - s * x
rot[:, 2, 0] = t * x * z - s * y
rot[:, 2, 1] = t * y * z + s * x
rot[:, 2, 2] = t * z * z + c
rot[:, 3, 3] = 1.0
return rot
def batch_fk(chain: list, joint_angles: np.ndarray) -> np.ndarray:
"""Vectorized FK: (n, 7) radians → (n, 3) TCP positions in world frame."""
n = joint_angles.shape[0]
tf_batch = np.tile(np.eye(4), (n, 1, 1))
qi = 0
for rpy, xyz, axis in chain:
tf_batch = tf_batch @ _tf(rpy, xyz)
if axis is not None:
rot = _batch_axis_rot(axis, joint_angles[:, qi])
tf_batch = np.einsum("nij,njk->nik", tf_batch, rot)
qi += 1
return tf_batch[:, :3, 3]
# ── Data loading ────────────────────────────────────────
def _flatten_names(obj: object) -> list[str]:
"""Recursively flatten a names structure (list, dict, or nested) into a flat string list."""
if isinstance(obj, dict):
out: list[str] = []
for v in obj.values():
out.extend(_flatten_names(v))
return out
if isinstance(obj, (list, tuple)):
out = []
for item in obj:
if isinstance(item, (list, tuple, dict)):
out.extend(_flatten_names(item))
else:
out.append(str(item))
return out
return [str(obj)]
def _detect_and_convert(vals: np.ndarray) -> np.ndarray:
"""Auto-detect servo ticks / degrees / radians and convert to radians."""
mx = np.max(np.abs(vals))
if mx > 360:
print(f" Unit detection: servo ticks (max={mx:.0f})")
return (vals - 2048) / 2048 * np.pi
if mx > 6.3:
print(f" Unit detection: degrees (max={mx:.1f})")
return np.deg2rad(vals)
print(f" Unit detection: radians (max={mx:.3f})")
return vals.astype(np.float64)
def _find_joint_indices(features: dict, state_col: str, n_dim: int) -> tuple[list[int], list[int]]:
"""Try to find left/right joint indices from info.json feature names."""
feat = features.get("observation.state", features.get(state_col, {}))
names = _flatten_names(feat.get("names", []))
left_idx: list[int] = []
right_idx: list[int] = []
if names and len(names) == n_dim:
names_l = [n.lower() for n in names]
print(f" Feature names: {names[:4]}{names[-4:]}")
for j in range(1, 8):
for i, nm in enumerate(names_l):
if f"left_joint_{j}" in nm and i not in left_idx:
left_idx.append(i)
break
for i, nm in enumerate(names_l):
if f"right_joint_{j}" in nm and i not in right_idx:
right_idx.append(i)
break
if len(left_idx) == 7 and len(right_idx) == 7:
print(f" Matched by name: left={left_idx} right={right_idx}")
return left_idx, right_idx
if n_dim >= 16:
print(" Falling back to positional: [0:7]=left, [8:15]=right")
return list(range(7)), list(range(8, 15))
if n_dim >= 14:
print(" Falling back to positional: [0:7]=left, [7:14]=right")
return list(range(7)), list(range(7, 14))
raise RuntimeError(f"State dim {n_dim} too small for bimanual 7-DOF robot")
def download_data(repo_id: str) -> Path:
print(f" Downloading {repo_id} (parquet only) …")
return Path(
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
allow_patterns=["meta/**", "data/**"],
ignore_patterns=["*.mp4", "videos/**"],
)
)
def resample_trajectory(traj: np.ndarray, n_waypoints: int) -> np.ndarray:
"""Resample a (F, 3) trajectory to exactly n_waypoints via linear interpolation."""
f = traj.shape[0]
if f == n_waypoints:
return traj
old_t = np.linspace(0, 1, f)
new_t = np.linspace(0, 1, n_waypoints)
return np.column_stack([np.interp(new_t, old_t, traj[:, d]) for d in range(3)])
def load_episode_trajectories(local: Path) -> list[dict]:
"""
Load per-episode joint data, compute FK, return list of trajectory dicts.
Each dict: {"left_tcp": (F,3), "right_tcp": (F,3), "episode_index": int}.
Uses all episodes in the dataset for a fair comparison.
"""
info = json.loads((local / "meta" / "info.json").read_text())
features = info.get("features", {})
dfs = [pd.read_parquet(pq) for pq in sorted((local / "data").glob("**/*.parquet"))]
df = pd.concat(dfs, ignore_index=True)
print(f" Total frames: {len(df):,}")
state_col = next((c for c in df.columns if "observation.state" in c), None)
if state_col is None:
raise RuntimeError(f"No observation.state column. Available: {list(df.columns)}")
first = df[state_col].iloc[0]
if not hasattr(first, "__len__"):
raise RuntimeError(f"observation.state is scalar ({type(first)}), expected array")
state = np.stack(df[state_col].values).astype(np.float64)
n_dim = state.shape[1]
print(f" State dim: {n_dim} max|val|: {np.max(np.abs(state)):.1f}")
left_idx, right_idx = _find_joint_indices(features, state_col, n_dim)
ep_col = next((c for c in df.columns if c == "episode_index"), None)
if ep_col is None:
raise RuntimeError(f"No episode_index column. Available: {list(df.columns)}")
episode_ids = df[ep_col].values
unique_eps = np.unique(episode_ids)
print(f" Episodes: {len(unique_eps):,}")
left_raw = state[:, left_idx]
right_raw = state[:, right_idx]
left_all = _detect_and_convert(left_raw)
right_all = _detect_and_convert(right_raw)
print(" Computing FK per episode …")
trajectories = []
for ep_id in unique_eps:
mask = episode_ids == ep_id
left_tcp = batch_fk(LEFT_CHAIN, left_all[mask])
right_tcp = batch_fk(RIGHT_CHAIN, right_all[mask])
if len(left_tcp) < 3:
continue
trajectories.append({"left_tcp": left_tcp, "right_tcp": right_tcp, "episode_index": int(ep_id)})
print(f" Valid trajectories: {len(trajectories):,}")
return trajectories
# ── Clustering ──────────────────────────────────────────
def cluster_trajectories(
trajectories: list[dict], n_clusters: int, n_waypoints: int
) -> tuple[np.ndarray, np.ndarray]:
"""
K-means on resampled trajectory features.
Combines left+right TCP into a single feature vector per episode.
Returns (labels, centroid_trajs (k, waypoints, 6), spread_per_cluster (k,) in metres).
Spread = mean per-waypoint Euclidean distance from each trajectory to its centroid.
"""
feat_vecs = []
for t in trajectories:
left_rs = resample_trajectory(t["left_tcp"], n_waypoints)
right_rs = resample_trajectory(t["right_tcp"], n_waypoints)
feat_vecs.append(np.concatenate([left_rs.ravel(), right_rs.ravel()]))
feat_matrix = np.array(feat_vecs)
k = min(n_clusters, len(feat_vecs))
km = KMeans(n_clusters=k, n_init=10, random_state=SEED)
labels = km.fit_predict(feat_matrix)
centroids_flat = km.cluster_centers_
centroid_trajs = np.zeros((k, n_waypoints, 6))
for ci in range(k):
left_flat = centroids_flat[ci, : n_waypoints * 3]
right_flat = centroids_flat[ci, n_waypoints * 3 :]
centroid_trajs[ci, :, :3] = left_flat.reshape(n_waypoints, 3)
centroid_trajs[ci, :, 3:] = right_flat.reshape(n_waypoints, 3)
# Mean per-waypoint distance to centroid (in metres) for each cluster
spread = np.zeros(k)
for ci in range(k):
members = np.where(labels == ci)[0]
if len(members) == 0:
continue
centroid_left = centroid_trajs[ci, :, :3]
centroid_right = centroid_trajs[ci, :, 3:]
dists = []
for mi in members:
t = trajectories[mi]
left_rs = resample_trajectory(t["left_tcp"], n_waypoints)
right_rs = resample_trajectory(t["right_tcp"], n_waypoints)
d_left = np.linalg.norm(left_rs - centroid_left, axis=1).mean()
d_right = np.linalg.norm(right_rs - centroid_right, axis=1).mean()
dists.append((d_left + d_right) / 2)
spread[ci] = np.mean(dists)
return labels, centroid_trajs, spread
# ── Visualization ───────────────────────────────────────
PROJ_VIEWS = [
("XZ (side)", 0, 2, "X (m)", "Z (m)"),
("XY (top)", 0, 1, "X (m)", "Y (m)"),
("YZ (front)", 1, 2, "Y (m)", "Z (m)"),
]
def render(results: list[dict], out_path: Path) -> None:
"""
2-row × 3-col grid per dataset (3 projections × 2 datasets).
Trajectory lines colored by cluster, centroid trajectories drawn thick.
"""
n_ds = len(results)
n_proj = len(PROJ_VIEWS)
fig, axes = plt.subplots(n_ds, n_proj, figsize=(7 * n_proj, 7 * n_ds), facecolor="#0d1117")
if n_ds == 1:
axes = axes[np.newaxis, :]
for row, r in enumerate(results):
trajectories = r["trajectories"]
labels = r["labels"]
centroids = r["centroids"]
k = centroids.shape[0]
cluster_sizes = np.bincount(labels, minlength=k)
size_order = np.argsort(-cluster_sizes)
pcts = cluster_sizes / len(labels) * 100
spread = r["spread"]
for col, (view_name, dim_a, dim_b, xlabel, ylabel) in enumerate(PROJ_VIEWS):
ax = axes[row, col]
ax.set_facecolor("#0d1117")
for ti, traj in enumerate(trajectories):
color = CLUSTER_COLORS[labels[ti] % len(CLUSTER_COLORS)]
for tcp_key in ("left_tcp", "right_tcp"):
pts = traj[tcp_key]
ax.plot(pts[:, dim_a], pts[:, dim_b], color=color, alpha=0.12, linewidth=0.4)
for ci in range(k):
color = CLUSTER_COLORS[ci % len(CLUSTER_COLORS)]
left_c = centroids[ci, :, :3]
right_c = centroids[ci, :, 3:]
lw = 1.5 + 2.0 * cluster_sizes[ci] / cluster_sizes.max()
for c_pts in (left_c, right_c):
ax.plot(
c_pts[:, dim_a],
c_pts[:, dim_b],
color=color,
linewidth=lw,
alpha=0.95,
zorder=10,
)
ax.plot(
c_pts[0, dim_a],
c_pts[0, dim_b],
"o",
color=color,
markersize=4,
zorder=11,
)
ax.plot(
c_pts[-1, dim_a],
c_pts[-1, dim_b],
"s",
color=color,
markersize=4,
zorder=11,
)
ax.set_xlabel(xlabel, color="#888", fontsize=9)
ax.set_ylabel(ylabel, color="#888", fontsize=9)
ax.tick_params(colors="#555", labelsize=7)
for spine in ax.spines.values():
spine.set_color("#333")
ax.set_aspect("equal")
mean_spread_cm = np.average(spread, weights=cluster_sizes) * 100
if col == 0:
ax.set_title(
f"{r['label']} ({r['n_episodes']:,} episodes, {k} clusters, "
f"avg spread {mean_spread_cm:.1f}cm)",
color="white",
fontsize=11,
pad=10,
)
else:
ax.set_title(view_name, color="#aaa", fontsize=10, pad=8)
# Cluster size + spread legend on the rightmost panel
legend_ax = axes[row, -1]
for ci in size_order:
color = CLUSTER_COLORS[ci % len(CLUSTER_COLORS)]
spread_cm = spread[ci] * 100
label = f"C{ci}: {cluster_sizes[ci]} eps ({pcts[ci]:.0f}%) ±{spread_cm:.1f}cm"
legend_ax.plot([], [], color=color, linewidth=3, label=label)
legend_ax.legend(
loc="upper right",
fontsize=7,
frameon=True,
facecolor="#1a1a2e",
edgecolor="#333",
labelcolor="white",
handlelength=1.5,
)
fig.suptitle(
"End-Effector Trajectory Clusters (FK · K-means)",
color="white",
fontsize=16,
y=0.98,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig(out_path, dpi=DPI, bbox_inches="tight", facecolor=fig.get_facecolor())
plt.close()
print(f"\n✓ Saved: {out_path}")
# ── Main ────────────────────────────────────────────────
def main() -> None:
results = []
for ds in DATASETS:
repo_id, label = ds["repo_id"], ds["label"]
print(f"\n{'=' * 60}")
print(f" {label}: {repo_id}")
print(f"{'=' * 60}")
local = download_data(repo_id)
trajectories = load_episode_trajectories(local)
labels, centroids, spread = cluster_trajectories(trajectories, N_CLUSTERS, WAYPOINTS)
cluster_sizes = np.bincount(labels, minlength=centroids.shape[0])
print(f" Cluster sizes: {sorted(cluster_sizes, reverse=True)}")
for ci in np.argsort(-cluster_sizes):
print(
f" C{ci}: {cluster_sizes[ci]} eps ({cluster_sizes[ci] / len(labels) * 100:.0f}%) "
f"spread ±{spread[ci] * 100:.1f}cm"
)
results.append(
{
"label": label,
"trajectories": trajectories,
"labels": labels,
"centroids": centroids,
"spread": spread,
"n_episodes": len(trajectories),
}
)
out = OUTPUT_DIR / "workspace_trajectory_clusters.jpg"
render(results, out)
if __name__ == "__main__":
main()
+2 -2
View File
@@ -467,8 +467,8 @@ class VQBeTHead(nn.Module):
self.vqvae_model.optimized_steps += 1
# if we updated RVQ more than `n_vqvae_training_steps` steps, we freeze the RVQ part.
if self.vqvae_model.optimized_steps >= n_vqvae_training_steps:
self.vqvae_model.discretized.fill_(True)
self.vqvae_model.vq_layer.freeze_codebook.fill_(True)
self.vqvae_model.discretized = torch.tensor(True)
self.vqvae_model.vq_layer.freeze_codebook = torch.tensor(True)
print("Finished discretizing action data!")
self.vqvae_model.eval()
for param in self.vqvae_model.vq_layer.parameters():
@@ -33,40 +33,21 @@ from .config_earthrover_mini_plus import EarthRoverMiniPlusConfig
logger = logging.getLogger(__name__)
# Action feature keys
ACTION_LINEAR_VEL = "linear_velocity"
ACTION_ANGULAR_VEL = "angular_velocity"
ACTION_LINEAR_VEL = "linear.vel"
ACTION_ANGULAR_VEL = "angular.vel"
# Observation feature keys — cameras
# Observation feature keys
OBS_FRONT = "front"
OBS_REAR = "rear"
# Observation feature keys — telemetry
OBS_SPEED = "speed"
OBS_BATTERY_LEVEL = "battery_level"
OBS_ORIENTATION = "orientation"
OBS_GPS_LATITUDE = "gps_latitude"
OBS_GPS_LONGITUDE = "gps_longitude"
OBS_GPS_SIGNAL = "gps_signal"
OBS_SIGNAL_LEVEL = "signal_level"
OBS_LINEAR_VEL = "linear.vel"
OBS_BATTERY_LEVEL = "battery.level"
OBS_ORIENTATION_DEG = "orientation.deg"
OBS_GPS_LATITUDE = "gps.latitude"
OBS_GPS_LONGITUDE = "gps.longitude"
OBS_GPS_SIGNAL = "gps.signal"
OBS_SIGNAL_LEVEL = "signal.level"
OBS_VIBRATION = "vibration"
OBS_LAMP = "lamp"
# Observation feature keys — IMU sensors
OBS_ACCELEROMETER_X = "accelerometer_x"
OBS_ACCELEROMETER_Y = "accelerometer_y"
OBS_ACCELEROMETER_Z = "accelerometer_z"
OBS_GYROSCOPE_X = "gyroscope_x"
OBS_GYROSCOPE_Y = "gyroscope_y"
OBS_GYROSCOPE_Z = "gyroscope_z"
OBS_MAGNETOMETER_X = "magnetometer_filtered_x"
OBS_MAGNETOMETER_Y = "magnetometer_filtered_y"
OBS_MAGNETOMETER_Z = "magnetometer_filtered_z"
# Observation feature keys — wheel RPMs
OBS_WHEEL_RPM_0 = "wheel_rpm_0"
OBS_WHEEL_RPM_1 = "wheel_rpm_1"
OBS_WHEEL_RPM_2 = "wheel_rpm_2"
OBS_WHEEL_RPM_3 = "wheel_rpm_3"
OBS_LAMP_STATE = "lamp.state"
class EarthRoverMiniPlus(Robot):
@@ -173,60 +154,33 @@ class EarthRoverMiniPlus(Robot):
dict: Observation features with types/shapes:
- front: (480, 640, 3) - Front camera RGB image
- rear: (480, 640, 3) - Rear camera RGB image
- speed: float - Current speed (raw SDK value)
- battery_level: float - Battery level (0-100)
- orientation: float - Robot orientation in degrees
- gps_latitude: float - GPS latitude coordinate
- gps_longitude: float - GPS longitude coordinate
- gps_signal: float - GPS signal strength (percentage)
- signal_level: float - Network signal level (0-5)
- linear.vel: float - Current speed (0-1, SDK reports only positive speeds)
- battery.level: float - Battery level (0-1, normalized from 0-100)
- orientation.deg: float - Robot orientation (0-1, normalized from raw value)
- gps.latitude: float - GPS latitude coordinate
- gps.longitude: float - GPS longitude coordinate
- gps.signal: float - GPS signal strength (0-1, normalized from percentage)
- signal.level: float - Network signal level (0-1, normalized from 0-5)
- vibration: float - Vibration sensor reading
- lamp: float - Lamp state (0=off, 1=on)
- accelerometer_x: float - Accelerometer X axis (raw SDK value)
- accelerometer_y: float - Accelerometer Y axis (raw SDK value)
- accelerometer_z: float - Accelerometer Z axis (raw SDK value)
- gyroscope_x: float - Gyroscope X axis (raw SDK value)
- gyroscope_y: float - Gyroscope Y axis (raw SDK value)
- gyroscope_z: float - Gyroscope Z axis (raw SDK value)
- magnetometer_filtered_x: float - Magnetometer X axis (raw SDK value)
- magnetometer_filtered_y: float - Magnetometer Y axis (raw SDK value)
- magnetometer_filtered_z: float - Magnetometer Z axis (raw SDK value)
- wheel_rpm_0: float - Wheel 0 RPM
- wheel_rpm_1: float - Wheel 1 RPM
- wheel_rpm_2: float - Wheel 2 RPM
- wheel_rpm_3: float - Wheel 3 RPM
- lamp.state: float - Lamp state (0=off, 1=on)
"""
return {
# Cameras (height, width, channels)
OBS_FRONT: (480, 640, 3),
OBS_REAR: (480, 640, 3),
# Telemetry
OBS_SPEED: float,
# Motion state
OBS_LINEAR_VEL: float,
# Robot state
OBS_BATTERY_LEVEL: float,
OBS_ORIENTATION: float,
OBS_ORIENTATION_DEG: float,
# GPS
OBS_GPS_LATITUDE: float,
OBS_GPS_LONGITUDE: float,
OBS_GPS_SIGNAL: float,
# Sensors
OBS_SIGNAL_LEVEL: float,
OBS_VIBRATION: float,
OBS_LAMP: float,
# IMU — accelerometer
OBS_ACCELEROMETER_X: float,
OBS_ACCELEROMETER_Y: float,
OBS_ACCELEROMETER_Z: float,
# IMU — gyroscope
OBS_GYROSCOPE_X: float,
OBS_GYROSCOPE_Y: float,
OBS_GYROSCOPE_Z: float,
# IMU — magnetometer
OBS_MAGNETOMETER_X: float,
OBS_MAGNETOMETER_Y: float,
OBS_MAGNETOMETER_Z: float,
# Wheel RPMs
OBS_WHEEL_RPM_0: float,
OBS_WHEEL_RPM_1: float,
OBS_WHEEL_RPM_2: float,
OBS_WHEEL_RPM_3: float,
OBS_LAMP_STATE: float,
}
@cached_property
@@ -235,8 +189,8 @@ class EarthRoverMiniPlus(Robot):
Returns:
dict: Action features with types:
- linear_velocity: float - Target linear velocity (-1 to 1)
- angular_velocity: float - Target angular velocity (-1 to 1)
- linear.vel: float - Target linear velocity
- angular.vel: float - Target angular velocity
"""
return {
ACTION_LINEAR_VEL: float,
@@ -247,29 +201,19 @@ class EarthRoverMiniPlus(Robot):
def get_observation(self) -> RobotObservation:
"""Get current robot observation from SDK.
Camera frames are retrieved from SDK endpoints /v2/front and /v2/rear.
Frames are decoded from base64 and converted from BGR to RGB format.
Robot telemetry is retrieved from /data endpoint.
Sensor arrays (accels, gyros, mags, rpms) each contain entries of
[values..., timestamp]; the latest reading from each array is used.
Returns:
RobotObservation: Observation containing:
- front: Front camera image (480, 640, 3) in RGB format
- rear: Rear camera image (480, 640, 3) in RGB format
- speed: float - Current speed (raw SDK value)
- battery_level: float - Battery level (0-100)
- orientation: float - Robot orientation in degrees
- gps_latitude: float - GPS latitude coordinate
- gps_longitude: float - GPS longitude coordinate
- gps_signal: float - GPS signal strength (percentage)
- signal_level: float - Network signal level (0-5)
- vibration: float - Vibration sensor reading
- lamp: float - Lamp state (0=off, 1=on)
- accelerometer_x/y/z: float - Accelerometer axes (raw SDK value)
- gyroscope_x/y/z: float - Gyroscope axes (raw SDK value)
- magnetometer_filtered_x/y/z: float - Magnetometer axes (raw SDK value)
- wheel_rpm_0/1/2/3: float - Wheel RPMs
- linear.vel: Current speed (0-1, SDK reports only positive speeds)
- battery.level: Battery level (0-1, normalized from 0-100)
- orientation.deg: Robot orientation (0-1, normalized from raw value)
- gps.latitude: GPS latitude coordinate
- gps.longitude: GPS longitude coordinate
- gps.signal: GPS signal strength (0-1, normalized from percentage)
- signal.level: Network signal level (0-1, normalized from 0-5)
- vibration: Vibration sensor reading
- lamp.state: Lamp state (0=off, 1=on)
Raises:
DeviceNotConnectedError: If robot is not connected
@@ -291,41 +235,22 @@ class EarthRoverMiniPlus(Robot):
# Get robot state from SDK
robot_data = self._get_robot_data()
# Telemetry
observation[OBS_SPEED] = float(robot_data["speed"])
observation[OBS_BATTERY_LEVEL] = float(robot_data["battery"])
observation[OBS_ORIENTATION] = float(robot_data["orientation"])
observation[OBS_GPS_LATITUDE] = float(robot_data["latitude"])
observation[OBS_GPS_LONGITUDE] = float(robot_data["longitude"])
observation[OBS_GPS_SIGNAL] = float(robot_data["gps_signal"])
observation[OBS_SIGNAL_LEVEL] = float(robot_data["signal_level"])
observation[OBS_VIBRATION] = float(robot_data["vibration"])
observation[OBS_LAMP] = float(robot_data["lamp"])
# Motion state
observation[OBS_LINEAR_VEL] = robot_data["speed"] / 100.0 # Normalize 0-100 to 0-1
# Accelerometer — latest reading from accels array [x, y, z, ts]
accel = self._latest_sensor_reading(robot_data, "accels", n_values=3)
observation[OBS_ACCELEROMETER_X] = accel[0]
observation[OBS_ACCELEROMETER_Y] = accel[1]
observation[OBS_ACCELEROMETER_Z] = accel[2]
# Robot state
observation[OBS_BATTERY_LEVEL] = robot_data["battery"] / 100.0 # Normalize 0-100 to 0-1
observation[OBS_ORIENTATION_DEG] = robot_data["orientation"] / 360.0 # Normalize to 0-1
# Gyroscope — latest reading from gyros array [x, y, z, ts]
gyro = self._latest_sensor_reading(robot_data, "gyros", n_values=3)
observation[OBS_GYROSCOPE_X] = gyro[0]
observation[OBS_GYROSCOPE_Y] = gyro[1]
observation[OBS_GYROSCOPE_Z] = gyro[2]
# GPS data
observation[OBS_GPS_LATITUDE] = robot_data["latitude"]
observation[OBS_GPS_LONGITUDE] = robot_data["longitude"]
observation[OBS_GPS_SIGNAL] = robot_data["gps_signal"] / 100.0 # Normalize percentage to 0-1
# Magnetometer — latest reading from mags array [x, y, z, ts]
mag = self._latest_sensor_reading(robot_data, "mags", n_values=3)
observation[OBS_MAGNETOMETER_X] = mag[0]
observation[OBS_MAGNETOMETER_Y] = mag[1]
observation[OBS_MAGNETOMETER_Z] = mag[2]
# Wheel RPMs — latest reading from rpms array [w0, w1, w2, w3, ts]
rpm = self._latest_sensor_reading(robot_data, "rpms", n_values=4)
observation[OBS_WHEEL_RPM_0] = rpm[0]
observation[OBS_WHEEL_RPM_1] = rpm[1]
observation[OBS_WHEEL_RPM_2] = rpm[2]
observation[OBS_WHEEL_RPM_3] = rpm[3]
# Sensors
observation[OBS_SIGNAL_LEVEL] = robot_data["signal_level"] / 5.0 # Normalize 0-5 to 0-1
observation[OBS_VIBRATION] = robot_data["vibration"]
observation[OBS_LAMP_STATE] = float(robot_data["lamp"]) # 0 or 1
return observation
@@ -335,12 +260,11 @@ class EarthRoverMiniPlus(Robot):
Args:
action: Action dict with keys:
- linear_velocity: Target linear velocity (-1 to 1)
- angular_velocity: Target angular velocity (-1 to 1)
- linear.vel: Target linear velocity (-1 to 1)
- angular.vel: Target angular velocity (-1 to 1)
Returns:
RobotAction: The action that was sent (matches action_features keys)
Raises:
DeviceNotConnectedError: If robot is not connected
@@ -348,14 +272,18 @@ class EarthRoverMiniPlus(Robot):
Actions are sent to SDK via POST /control endpoint.
SDK expects commands in range [-1, 1].
"""
# Extract action values and convert to float
linear = float(action.get(ACTION_LINEAR_VEL, 0.0))
angular = float(action.get(ACTION_ANGULAR_VEL, 0.0))
# Send command to SDK
try:
self._send_command_to_sdk(linear, angular)
except Exception as e:
logger.error(f"Error sending action: {e}")
# Return action in format matching action_features
return {
ACTION_LINEAR_VEL: linear,
ACTION_ANGULAR_VEL: angular,
@@ -466,27 +394,11 @@ class EarthRoverMiniPlus(Robot):
logger.error(f"Error decoding image: {e}")
return None
@staticmethod
def _latest_sensor_reading(robot_data: dict, key: str, n_values: int) -> list[float]:
"""Extract the latest sensor reading from an SDK sensor array.
The SDK returns sensor arrays like ``accels``, ``gyros``, ``mags``,
``rpms`` where each entry is ``[value_0, ..., value_n, timestamp]``.
This helper returns the *n_values* leading floats from the last entry,
falling back to zeros when the key is missing or the array is empty.
"""
readings = robot_data.get(key)
if readings and len(readings) > 0:
latest = readings[-1]
return [float(v) for v in latest[:n_values]]
return [0.0] * n_values
def _get_robot_data(self) -> dict:
"""Get robot telemetry data from SDK.
Returns:
dict: Robot telemetry data including battery, speed, orientation, GPS,
and sensor arrays (accels, gyros, mags, rpms):
dict: Robot telemetry data including battery, speed, orientation, GPS, etc:
- Current data (if request succeeds)
- Cached data (if request fails but cache exists)
- Default values (if request fails and no cache exists yet)
@@ -508,23 +420,19 @@ class EarthRoverMiniPlus(Robot):
# Fallback: use cache or default values
if self._last_robot_data is not None:
return self._last_robot_data
# Return dict with default values (used only on first failure before any cache exists)
return {
"speed": 0,
"battery": 0,
"orientation": 0,
"latitude": 0.0,
"longitude": 0.0,
"gps_signal": 0,
"signal_level": 0,
"vibration": 0.0,
"lamp": 0,
"accels": [],
"gyros": [],
"mags": [],
"rpms": [],
}
else:
# Return dict with default values (used only on first failure before any cache exists)
return {
"speed": 0,
"battery": 0,
"orientation": 0,
"latitude": 0.0,
"longitude": 0.0,
"gps_signal": 0,
"signal_level": 0,
"vibration": 0.0,
"lamp": 0,
}
def _send_command_to_sdk(self, linear: float, angular: float, lamp: int = 0) -> bool:
"""Send control command to SDK.
@@ -341,8 +341,8 @@ class KeyboardRoverTeleop(KeyboardTeleop):
def action_features(self) -> dict:
"""Return action format for rover (linear and angular velocities)."""
return {
"linear_velocity": float,
"angular_velocity": float,
"linear.vel": float,
"angular.vel": float,
}
@property
@@ -366,7 +366,7 @@ class KeyboardRoverTeleop(KeyboardTeleop):
Get the current action based on pressed keys.
Returns:
RobotAction with 'linear_velocity' and 'angular_velocity' keys.
RobotAction with 'linear.vel' and 'angular.vel' keys
"""
before_read_t = time.perf_counter()
@@ -427,6 +427,6 @@ class KeyboardRoverTeleop(KeyboardTeleop):
self.logs["read_pos_dt_s"] = time.perf_counter() - before_read_t
return {
"linear_velocity": linear_velocity,
"angular_velocity": angular_velocity,
"linear.vel": linear_velocity,
"angular.vel": angular_velocity,
}
-44
View File
@@ -42,8 +42,6 @@ from lerobot.policies.factory import (
make_pre_post_processors,
)
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
from lerobot.policies.vqbet.modeling_vqbet import VQBeTHead
from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
from lerobot.utils.random_utils import seeded_context
from tests.artifacts.policies.save_policy_to_safetensors import get_policy_stats
@@ -462,45 +460,3 @@ def test_act_temporal_ensembler():
assert torch.all(offline_avg <= einops.reduce(seq_slice, "b s 1 -> b 1", "max"))
# Selected atol=1e-4 keeping in mind actions in [-1, 1] and excepting 0.01% error.
torch.testing.assert_close(online_avg, offline_avg, rtol=1e-4, atol=1e-4)
def test_vqbet_discretize_keeps_buffers_on_device():
"""Regression test: VQBeTHead.discretize() must not move registered buffers off the model device.
Previously, `self.vqvae_model.discretized = torch.tensor(True)` replaced the
registered buffer with a new CPU tensor, causing DDP to crash with:
RuntimeError: No backend type associated with device type cpu
The fix uses `.fill_(True)` to update in-place, preserving device placement.
"""
config = VQBeTConfig()
config.input_features = {
OBS_IMAGES: PolicyFeature(type=FeatureType.VISUAL, shape=(3, 96, 96)),
OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(6,)),
}
config.output_features = {
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(6,)),
}
# Tiny sizes for fast CPU/GPU execution.
config.n_vqvae_training_steps = 3
config.vqvae_n_embed = 8
config.vqvae_embedding_dim = 32
config.vqvae_enc_hidden_dim = 32
config.action_chunk_size = 2
config.crop_shape = (84, 84)
head = VQBeTHead(config).to(DEVICE)
vqvae = head.vqvae_model
dummy_actions = torch.randn(4, config.action_chunk_size, config.action_feature.shape[0], device=DEVICE)
n_steps = config.n_vqvae_training_steps
for _ in range(n_steps):
head.discretize(n_steps, dummy_actions)
assert vqvae.discretized.device.type == torch.device(DEVICE).type, (
"vqvae_model.discretized was moved off the model device after discretize(). "
"Use .fill_(True) instead of = torch.tensor(True) to keep the buffer on device."
)
assert vqvae.vq_layer.freeze_codebook.device.type == torch.device(DEVICE).type, (
"vq_layer.freeze_codebook was moved off the model device after discretize(). "
"Use .fill_(True) instead of = torch.tensor(True) to keep the buffer on device."
)