mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 20:19:43 +00:00
test(annotate): unstale the two failing module tests
Both tests were stale relative to design changes that landed earlier on this branch. Update the tests to match the current production contract. **``test_module1_attaches_video_block_to_subtask_prompt``** The test took ``captured[0]`` and asserted on its content blocks, but Module 1 issues several sub-prompts and the rephrasings call (which is text-only, no video block) usually lands first. Two fixes: * The test's intent is "the subtask prompt carries the video block" — not "the first prompt carries it". Pick the call by content (``"atomic subtasks"`` keyword in the text block) so the test is resilient to future reordering of unrelated sub-prompts. * Set ``n_task_rephrasings=0`` so the rephrasings call is skipped entirely — keeps the test focused on ``_generate_subtasks``. **``test_module2_mid_episode_emits_paired_interjection_and_speech``** Two issues both rooted in design changes on the branch: 1. ``InterjectionsAndSpeechModule._mid_episode_interjections`` now anchors interjections on subtask boundaries from Module 1's staging tree, bailing out with zero rows when no spans exist. The production executor runs Module 1 first; the test ran Module 2 in isolation. Reproduce the contract by seeding two ``style=subtask`` rows in the staging before calling Module 2 — gives it the single ``0 → 1`` boundary it needs. 2. The test's stub responder used the marker ``"ONE realistic interruption"`` to match the interjection prompt, but that string is from a previous prompt version. The current ``module_2_interjection.txt`` says ``"Write ONE interjection..."`` — the old prompt asked for counterfactual interjections (e.g. "skip the wipe"), the new one anchors on the upcoming subtask. Marker updated to ``"Write ONE interjection"``; canned response wording aligned to the new design. Sweep on the language stack: 66 passed, 0 failed (was 64 passed, 2 failed). Pre-commit clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -126,12 +126,26 @@ def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: P
|
|||||||
def test_module2_mid_episode_emits_paired_interjection_and_speech(
|
def test_module2_mid_episode_emits_paired_interjection_and_speech(
|
||||||
fixture_dataset_root: Path, tmp_path: Path
|
fixture_dataset_root: Path, tmp_path: Path
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Module 2 anchors interjections on Module 1's subtask boundaries.
|
||||||
|
|
||||||
|
The executor runs Module 1 first, then Module 2 reads the subtask
|
||||||
|
rows back from the same staging tree (see
|
||||||
|
``_mid_episode_interjections``). Reproduce that contract here by
|
||||||
|
seeding the staging with two subtask rows so a single ``0 → 1``
|
||||||
|
boundary exists for Module 2 to anchor on.
|
||||||
|
"""
|
||||||
vlm = make_canned_responder(
|
vlm = make_canned_responder(
|
||||||
{
|
{
|
||||||
"acknowledgement the robot": {"text": "OK."},
|
"acknowledgement the robot": {"text": "OK."},
|
||||||
"ONE realistic interruption": {
|
# Marker matches the distinctive line of
|
||||||
"interjection": "actually skip the dishes",
|
# ``module_2_interjection.txt``. The old marker
|
||||||
"speech": "Skipping the dishes.",
|
# ("ONE realistic interruption") came from a previous prompt
|
||||||
|
# version that asked for counterfactual interjections; the
|
||||||
|
# current design anchors on subtask boundaries instead, so
|
||||||
|
# the prompt and its marker changed.
|
||||||
|
"Write ONE interjection": {
|
||||||
|
"interjection": "now wipe the counter please",
|
||||||
|
"speech": "On it.",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -142,6 +156,29 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
|
|||||||
)
|
)
|
||||||
record = next(iter_episodes(fixture_dataset_root))
|
record = next(iter_episodes(fixture_dataset_root))
|
||||||
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
||||||
|
# Seed Module 1's subtask staging so Module 2 has a boundary to
|
||||||
|
# anchor on (it bails with zero rows when no spans exist — the
|
||||||
|
# production executor guarantees Module 1 ran first).
|
||||||
|
boundary_ts = float(record.frame_timestamps[len(record.frame_timestamps) // 2])
|
||||||
|
staging.write(
|
||||||
|
"module_1",
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "grasp the sponge",
|
||||||
|
"style": "subtask",
|
||||||
|
"timestamp": float(record.frame_timestamps[0]),
|
||||||
|
"tool_calls": None,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "wipe the counter",
|
||||||
|
"style": "subtask",
|
||||||
|
"timestamp": boundary_ts,
|
||||||
|
"tool_calls": None,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
)
|
||||||
module.run_episode(record, staging)
|
module.run_episode(record, staging)
|
||||||
rows = staging.read("module_2")
|
rows = staging.read("module_2")
|
||||||
|
|
||||||
@@ -163,9 +200,7 @@ def test_module3_vqa_unique_per_frame_and_camera(single_episode_root: Path, tmp_
|
|||||||
vlm=vlm,
|
vlm=vlm,
|
||||||
config=Module3Config(vqa_emission_hz=1.0, K=3),
|
config=Module3Config(vqa_emission_hz=1.0, K=3),
|
||||||
seed=1,
|
seed=1,
|
||||||
frame_provider=_StubFrameProvider(
|
frame_provider=_StubFrameProvider(cameras=("observation.images.top", "observation.images.wrist")),
|
||||||
cameras=("observation.images.top", "observation.images.wrist")
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
record = next(iter_episodes(single_episode_root))
|
record = next(iter_episodes(single_episode_root))
|
||||||
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
||||||
@@ -176,13 +211,9 @@ def test_module3_vqa_unique_per_frame_and_camera(single_episode_root: Path, tmp_
|
|||||||
assert r["style"] == "vqa"
|
assert r["style"] == "vqa"
|
||||||
assert r.get("camera") in {"observation.images.top", "observation.images.wrist"}
|
assert r.get("camera") in {"observation.images.top", "observation.images.wrist"}
|
||||||
# at most one (vqa, user) and one (vqa, assistant) per (timestamp, camera)
|
# at most one (vqa, user) and one (vqa, assistant) per (timestamp, camera)
|
||||||
user_keys = [
|
user_keys = [(r["timestamp"], r["camera"]) for r in rows if r["role"] == "user" and r["style"] == "vqa"]
|
||||||
(r["timestamp"], r["camera"]) for r in rows if r["role"] == "user" and r["style"] == "vqa"
|
|
||||||
]
|
|
||||||
assistant_keys = [
|
assistant_keys = [
|
||||||
(r["timestamp"], r["camera"])
|
(r["timestamp"], r["camera"]) for r in rows if r["role"] == "assistant" and r["style"] == "vqa"
|
||||||
for r in rows
|
|
||||||
if r["role"] == "assistant" and r["style"] == "vqa"
|
|
||||||
]
|
]
|
||||||
assert len(user_keys) == len(set(user_keys))
|
assert len(user_keys) == len(set(user_keys))
|
||||||
assert len(assistant_keys) == len(set(assistant_keys))
|
assert len(assistant_keys) == len(set(assistant_keys))
|
||||||
@@ -222,17 +253,32 @@ def test_module1_attaches_video_block_to_subtask_prompt(fixture_dataset_root: Pa
|
|||||||
provider = _StubFrameProvider()
|
provider = _StubFrameProvider()
|
||||||
module = PlanSubtasksMemoryModule(
|
module = PlanSubtasksMemoryModule(
|
||||||
vlm=StubVlmClient(responder=responder),
|
vlm=StubVlmClient(responder=responder),
|
||||||
config=Module1Config(max_video_frames=5, frames_per_second=10.0),
|
# Disable the rephrasings sub-prompt so the test's only video-bearing
|
||||||
|
# call is the subtask one — keeps the assertions below focused on
|
||||||
|
# ``_generate_subtasks`` rather than fighting the order of unrelated
|
||||||
|
# text-only Module-1 sub-prompts.
|
||||||
|
config=Module1Config(max_video_frames=5, frames_per_second=10.0, n_task_rephrasings=0),
|
||||||
frame_provider=provider,
|
frame_provider=provider,
|
||||||
)
|
)
|
||||||
record = next(iter_episodes(fixture_dataset_root))
|
record = next(iter_episodes(fixture_dataset_root))
|
||||||
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
||||||
module.run_episode(record, staging)
|
module.run_episode(record, staging)
|
||||||
|
|
||||||
# the subtask call (the first VLM call) must carry exactly one video block
|
# Find the call carrying the subtask prompt rather than blindly taking
|
||||||
|
# captured[0] — Module 1 issues several sub-prompts and their order is
|
||||||
|
# not part of the contract.
|
||||||
assert captured, "no VLM calls made"
|
assert captured, "no VLM calls made"
|
||||||
first_call = captured[0]
|
|
||||||
content = first_call[0]["content"]
|
def _prompt_text(messages):
|
||||||
|
for m in messages:
|
||||||
|
for block in m.get("content", []):
|
||||||
|
if isinstance(block, dict) and block.get("type") == "text":
|
||||||
|
return block.get("text", "")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
subtask_calls = [m for m in captured if "atomic subtasks" in _prompt_text(m)]
|
||||||
|
assert len(subtask_calls) == 1, "expected exactly one subtask-prompt VLM call"
|
||||||
|
content = subtask_calls[0][0]["content"]
|
||||||
video_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "video"]
|
video_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "video"]
|
||||||
image_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "image"]
|
image_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "image"]
|
||||||
text_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "text"]
|
text_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "text"]
|
||||||
|
|||||||
Reference in New Issue
Block a user