mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-11 14:49:43 +00:00
test(annotate): unstale the two failing module tests
Both tests were stale relative to design changes that landed earlier on this branch. Update the tests to match the current production contract. **``test_module1_attaches_video_block_to_subtask_prompt``** The test took ``captured[0]`` and asserted on its content blocks, but Module 1 issues several sub-prompts and the rephrasings call (which is text-only, no video block) usually lands first. Two fixes: * The test's intent is "the subtask prompt carries the video block" — not "the first prompt carries it". Pick the call by content (``"atomic subtasks"`` keyword in the text block) so the test is resilient to future reordering of unrelated sub-prompts. * Set ``n_task_rephrasings=0`` so the rephrasings call is skipped entirely — this keeps the test focused on ``_generate_subtasks``. **``test_module2_mid_episode_emits_paired_interjection_and_speech``** Two issues, both rooted in design changes on the branch: 1. ``InterjectionsAndSpeechModule._mid_episode_interjections`` now anchors interjections on subtask boundaries from Module 1's staging tree, bailing out with zero rows when no spans exist. The production executor runs Module 1 first; the test ran Module 2 in isolation. Reproduce the contract by seeding two ``style=subtask`` rows in the staging before calling Module 2 — this gives it the single ``0 → 1`` boundary it needs. 2. The test's stub responder used the marker ``"ONE realistic interruption"`` to match the interjection prompt, but that string is from a previous prompt version. The current ``module_2_interjection.txt`` says ``"Write ONE interjection..."`` — the old prompt asked for counterfactual interjections (e.g. "skip the wipe"), the new one anchors on the upcoming subtask. Marker updated to ``"Write ONE interjection"``; canned response wording aligned to the new design. Sweep on the language stack: 66 passed, 0 failed (was 64 passed, 2 failed). Pre-commit clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -126,12 +126,26 @@ def test_module2_at_t0_emits_speech_only_no_interjection(fixture_dataset_root: P
|
||||
def test_module2_mid_episode_emits_paired_interjection_and_speech(
|
||||
fixture_dataset_root: Path, tmp_path: Path
|
||||
) -> None:
|
||||
"""Module 2 anchors interjections on Module 1's subtask boundaries.
|
||||
|
||||
The executor runs Module 1 first, then Module 2 reads the subtask
|
||||
rows back from the same staging tree (see
|
||||
``_mid_episode_interjections``). Reproduce that contract here by
|
||||
seeding the staging with two subtask rows so a single ``0 → 1``
|
||||
boundary exists for Module 2 to anchor on.
|
||||
"""
|
||||
vlm = make_canned_responder(
|
||||
{
|
||||
"acknowledgement the robot": {"text": "OK."},
|
||||
"ONE realistic interruption": {
|
||||
"interjection": "actually skip the dishes",
|
||||
"speech": "Skipping the dishes.",
|
||||
# Marker matches the distinctive line of
|
||||
# ``module_2_interjection.txt``. The old marker
|
||||
# ("ONE realistic interruption") came from a previous prompt
|
||||
# version that asked for counterfactual interjections; the
|
||||
# current design anchors on subtask boundaries instead, so
|
||||
# the prompt and its marker changed.
|
||||
"Write ONE interjection": {
|
||||
"interjection": "now wipe the counter please",
|
||||
"speech": "On it.",
|
||||
},
|
||||
},
|
||||
)
|
||||
@@ -142,6 +156,29 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
|
||||
)
|
||||
record = next(iter_episodes(fixture_dataset_root))
|
||||
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
||||
# Seed Module 1's subtask staging so Module 2 has a boundary to
|
||||
# anchor on (it bails with zero rows when no spans exist — the
|
||||
# production executor guarantees Module 1 ran first).
|
||||
boundary_ts = float(record.frame_timestamps[len(record.frame_timestamps) // 2])
|
||||
staging.write(
|
||||
"module_1",
|
||||
[
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "grasp the sponge",
|
||||
"style": "subtask",
|
||||
"timestamp": float(record.frame_timestamps[0]),
|
||||
"tool_calls": None,
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "wipe the counter",
|
||||
"style": "subtask",
|
||||
"timestamp": boundary_ts,
|
||||
"tool_calls": None,
|
||||
},
|
||||
],
|
||||
)
|
||||
module.run_episode(record, staging)
|
||||
rows = staging.read("module_2")
|
||||
|
||||
@@ -163,9 +200,7 @@ def test_module3_vqa_unique_per_frame_and_camera(single_episode_root: Path, tmp_
|
||||
vlm=vlm,
|
||||
config=Module3Config(vqa_emission_hz=1.0, K=3),
|
||||
seed=1,
|
||||
frame_provider=_StubFrameProvider(
|
||||
cameras=("observation.images.top", "observation.images.wrist")
|
||||
),
|
||||
frame_provider=_StubFrameProvider(cameras=("observation.images.top", "observation.images.wrist")),
|
||||
)
|
||||
record = next(iter_episodes(single_episode_root))
|
||||
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
||||
@@ -176,13 +211,9 @@ def test_module3_vqa_unique_per_frame_and_camera(single_episode_root: Path, tmp_
|
||||
assert r["style"] == "vqa"
|
||||
assert r.get("camera") in {"observation.images.top", "observation.images.wrist"}
|
||||
# at most one (vqa, user) and one (vqa, assistant) per (timestamp, camera)
|
||||
user_keys = [
|
||||
(r["timestamp"], r["camera"]) for r in rows if r["role"] == "user" and r["style"] == "vqa"
|
||||
]
|
||||
user_keys = [(r["timestamp"], r["camera"]) for r in rows if r["role"] == "user" and r["style"] == "vqa"]
|
||||
assistant_keys = [
|
||||
(r["timestamp"], r["camera"])
|
||||
for r in rows
|
||||
if r["role"] == "assistant" and r["style"] == "vqa"
|
||||
(r["timestamp"], r["camera"]) for r in rows if r["role"] == "assistant" and r["style"] == "vqa"
|
||||
]
|
||||
assert len(user_keys) == len(set(user_keys))
|
||||
assert len(assistant_keys) == len(set(assistant_keys))
|
||||
@@ -222,17 +253,32 @@ def test_module1_attaches_video_block_to_subtask_prompt(fixture_dataset_root: Pa
|
||||
provider = _StubFrameProvider()
|
||||
module = PlanSubtasksMemoryModule(
|
||||
vlm=StubVlmClient(responder=responder),
|
||||
config=Module1Config(max_video_frames=5, frames_per_second=10.0),
|
||||
# Disable the rephrasings sub-prompt so the test's only video-bearing
|
||||
# call is the subtask one — keeps the assertions below focused on
|
||||
# ``_generate_subtasks`` rather than fighting the order of unrelated
|
||||
# text-only Module-1 sub-prompts.
|
||||
config=Module1Config(max_video_frames=5, frames_per_second=10.0, n_task_rephrasings=0),
|
||||
frame_provider=provider,
|
||||
)
|
||||
record = next(iter_episodes(fixture_dataset_root))
|
||||
staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
|
||||
module.run_episode(record, staging)
|
||||
|
||||
# the subtask call (the first VLM call) must carry exactly one video block
|
||||
# Find the call carrying the subtask prompt rather than blindly taking
|
||||
# captured[0] — Module 1 issues several sub-prompts and their order is
|
||||
# not part of the contract.
|
||||
assert captured, "no VLM calls made"
|
||||
first_call = captured[0]
|
||||
content = first_call[0]["content"]
|
||||
|
||||
def _prompt_text(messages):
|
||||
for m in messages:
|
||||
for block in m.get("content", []):
|
||||
if isinstance(block, dict) and block.get("type") == "text":
|
||||
return block.get("text", "")
|
||||
return ""
|
||||
|
||||
subtask_calls = [m for m in captured if "atomic subtasks" in _prompt_text(m)]
|
||||
assert len(subtask_calls) == 1, "expected exactly one subtask-prompt VLM call"
|
||||
content = subtask_calls[0][0]["content"]
|
||||
video_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "video"]
|
||||
image_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "image"]
|
||||
text_blocks = [b for b in content if isinstance(b, dict) and b.get("type") == "text"]
|
||||
|
||||
Reference in New Issue
Block a user