Files
lerobot/tests/policies/test_sarm_subtask_annotations.py
T
Pepijn f04958527e Add sarm (#2639)
* add initial modeling

* make rewind pretrained policy

* add annotation

* small fix

* add sarm

* subtasks

* fix spawn

* fix rewind discrepancies

* Add script to generate embedding for dataset (#2138)

* Add generate and validate script

* fix precommit

* Improve generate embeddings function by using dataset tools (#2206)

---------

Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>

* cleanup

* change order train log

* print batch size

* update sarm processor

* add reward output

* change expected features

* add image validation

* change validation

* get state input from dataset stats

* raise if no state key is found

* pass stats

* cleanup and refactor

* add episode inddex to complementary data

* add subtask init and detection

* revert lerobot_train changes

* pass dataset metadata to policy

* change loadig subtasks

* add small logging

* fix progress conversion and adding initial frame

* use large offset for initial frame (ugly)

* Remove rewind, use clip tokenizer

* add tests, implement formula 1,2 correctly and cleanup

* use task from dataset, cleanup visualizer

* simplify

* simplify and cleanup code and move compute_temporal_proportions to utils

* fix normalization in visualization

* Fix visualization and change prompt

* fix formatting

* add visualize subtask annotations

* use qwen thinking

* try different prompt

* format

* update prompt

* higher temp, long output

* different settings

* use instruct

* show full resp

* split message

* Temp: increase tolerance dataset

* Fix RA-BC (#2572)

* Add next observation loading for RA-BC progress deltas

* Compute weights based on temporal progress deltas instead of static rewards

* Add hard-masking for negative progress deltas in weight computation

* Feat/add dual head (#2582)

* Add dual dense sparse head and annotation

* Add docs

* add dual to procesor

* cleanup

* change sampling in visualize and cleanup

* remove validation

* remove compile

* Feat/test uniform (#2587)

* test uniform

* add different string for misaligned

* Fix rewind and add tests

* uncomment text implementation

* run precommit

* Add head mode for ra-bc

* fix visalization of single task

* add

* return per sample loss

* Fix RA_BC (#2602)

* update rabc implementation

* compute rabc beforehand

* fix import

* add only progress calulation

* use precomputed progress

* multi gpu processing

* import

* fix dataset meta data extraction

* add logging

* logging

* log

* progress per episode

* split differently

* move clip to gpu

* pre decode frames for an episode

* fix cuda initalization

* fix import

* multi processing

* rename

* fix import

* fix

* fix rabc

* use last known progress if oob

* use last known progress if oob

* add misalignment loss with random embeddings

* discard previous changes

* add selection of models to docs for ra_bc

* add transformers dep

* extend tolerance

* initial commit with new codebase

* add tests

* fix

* remove temporal sampler

* drop last frame for sampler

* use original ref

* some fixes

* fix visualization

* remove smoothing and fix order subtasks

* add stride rabc computation

* add push to hub

* add explanation

* add kappa expllaination

* better rabc logging

* feedback pr

* remove dataset tolerance

* revert dataset tool

* revert dataset changes

* add credit

* run precommit

* change path for generate ra_bc

* fix type

* include sarm in all in pyproject

* fix precommit

* lazy import matplotlib

* lazy import qwen

* remove rich console

* skip if transformers is not installed?

* run only when we have faker

* place transformer lazy loading

* Dont test if low transformer version

* fix

* increase transformer

* increase as 4.57.0 is yanked

* remove pi from all

* go back

---------

Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>
Co-authored-by: s1lent4gnt <kmeftah.khalil@gmail.com>
2025-12-18 12:50:32 +01:00

135 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
pytest.importorskip("transformers")
from lerobot.data_processing.sarm_annotations.subtask_annotation import (
Subtask,
SubtaskAnnotation,
Timestamp,
compute_temporal_proportions,
)
def make_annotation(subtasks: list[tuple[str, int, int]]) -> SubtaskAnnotation:
"""Helper to create SubtaskAnnotation from list of (name, start_sec, end_sec)."""
return SubtaskAnnotation(
subtasks=[
Subtask(
name=name,
timestamps=Timestamp(
start=f"{start // 60:02d}:{start % 60:02d}", end=f"{end // 60:02d}:{end % 60:02d}"
),
)
for name, start, end in subtasks
]
)
class TestComputeTemporalProportions:
"""Tests for compute_temporal_proportions (SARM Paper Formula 1).
Formula: ᾱ_k = (1/M) × Σ_i (L_{i,k} / T_i)
Key insight: This averages the PROPORTION of each subtask within each trajectory,
giving equal weight to all trajectories regardless of absolute length.
"""
def test_basic_two_trajectories_equal_proportions(self):
"""Test with two trajectories that have equal proportions."""
# Both trajectories: subtask1 = 50%, subtask2 = 50%
# Traj 1: T=100s, subtask1=50s, subtask2=50s
# Traj 2: T=200s, subtask1=100s, subtask2=100s
annotations = {
0: make_annotation([("subtask1", 0, 50), ("subtask2", 50, 100)]),
1: make_annotation([("subtask1", 0, 100), ("subtask2", 100, 200)]),
}
result = compute_temporal_proportions(annotations)
# Both should be 0.5
assert abs(result["subtask1"] - 0.5) < 1e-6
assert abs(result["subtask2"] - 0.5) < 1e-6
def test_paper_example_different_from_avg_durations(self):
"""Test that compute_temporal_proportions differs from naive average duration approach.
This is the key test showing the difference between:
- Paper formula: average of (L_i,k / T_i)
- Naive approach: mean(L_i,k) / sum(mean(L_i,j))
"""
# Episode 1: T=100s, subtask1=80s, subtask2=20s (proportions: 0.8, 0.2)
# Episode 2: T=200s, subtask1=40s, subtask2=160s (proportions: 0.2, 0.8)
annotations = {
0: make_annotation([("subtask1", 0, 80), ("subtask2", 80, 100)]),
1: make_annotation([("subtask1", 0, 40), ("subtask2", 40, 200)]),
}
result = compute_temporal_proportions(annotations)
# Paper formula:
# ᾱ_1 = (1/2) × (80/100 + 40/200) = (1/2) × (0.8 + 0.2) = 0.5
# ᾱ_2 = (1/2) × (20/100 + 160/200) = (1/2) × (0.2 + 0.8) = 0.5
assert abs(result["subtask1"] - 0.5) < 1e-6
assert abs(result["subtask2"] - 0.5) < 1e-6
def test_single_trajectory(self):
"""Test with a single trajectory."""
# T=100s, reach=30s, grasp=20s, lift=50s
annotations = {
0: make_annotation([("reach", 0, 30), ("grasp", 30, 50), ("lift", 50, 100)]),
}
result = compute_temporal_proportions(annotations)
assert abs(result["reach"] - 0.3) < 1e-6
assert abs(result["grasp"] - 0.2) < 1e-6
assert abs(result["lift"] - 0.5) < 1e-6
def test_sum_to_one(self):
"""Test that proportions always sum to 1."""
# Three episodes with varying proportions
annotations = {
0: make_annotation([("a", 0, 10), ("b", 10, 50), ("c", 50, 100)]), # 0.1, 0.4, 0.5
1: make_annotation([("a", 0, 20), ("b", 20, 70), ("c", 70, 100)]), # 0.2, 0.5, 0.3
2: make_annotation([("a", 0, 30), ("b", 30, 90), ("c", 90, 100)]), # 0.3, 0.6, 0.1
}
result = compute_temporal_proportions(annotations)
total = sum(result.values())
assert abs(total - 1.0) < 1e-6
def test_empty_annotations_returns_empty(self):
"""Test that empty annotations returns empty dict."""
result = compute_temporal_proportions({})
assert result == {}
def test_uniform_proportions(self):
"""Test with uniform proportions across subtasks."""
# Each subtask takes 25% of each episode
annotations = {
0: make_annotation([("a", 0, 25), ("b", 25, 50), ("c", 50, 75), ("d", 75, 100)]),
1: make_annotation([("a", 0, 50), ("b", 50, 100), ("c", 100, 150), ("d", 150, 200)]),
}
result = compute_temporal_proportions(annotations)
for name in ["a", "b", "c", "d"]:
assert abs(result[name] - 0.25) < 1e-6