diff --git a/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py b/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py index b65b08b6a..f434d9b9e 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py @@ -120,33 +120,56 @@ class InterjectionsAndSpeechModule: record: EpisodeRecord, subtask_spans: Sequence[dict[str, Any]], ) -> list[dict[str, Any]]: + """Generate interjections aligned with the actual demo trajectory. + + Teleop data is frozen — the robot already executed every step in + the video. A *counterfactual* interjection like "actually skip + the wipe" contradicts what then happens in the video, which is + what qwen36moe-10/11 surfaced as low-quality interjections. + + Instead, anchor every interjection at a subtask boundary and + write it as a natural user request for the *upcoming* subtask. + The robot's visible next behavior IS the interjection's effect, + so the training signal stays consistent: interjection text → + plan refresh → action stream all line up. + """ if self.config.max_interjections_per_episode <= 0: return [] + if len(subtask_spans) < 2: + # Need at least one transition (subtask 0 → subtask 1). + return [] # Deterministic per-episode RNG so reruns are stable across SLURM jobs. rng = random.Random(f"{self.seed}:{record.episode_index}:interjection") - candidate_ts = [t for t in record.frame_timestamps if t >= self.config.interjection_min_t] - if not candidate_ts: + + # Boundaries: the start time of every subtask except the first + # (which is just t0 and is covered by the initial-task speech atom). 
+ boundaries: list[tuple[float, str, str]] = [] + for i in range(1, len(subtask_spans)): + ts = float(subtask_spans[i]["start"]) + if ts < self.config.interjection_min_t: + continue + prev_text = (subtask_spans[i - 1].get("text") or "").strip() + next_text = (subtask_spans[i].get("text") or "").strip() + if not next_text: + continue + boundaries.append((ts, prev_text, next_text)) + if not boundaries: return [] - # Pick at most ``max_interjections_per_episode`` distinct timestamps. - # Previously capped at ``len(candidate_ts) // 4`` — that floor was - # only relevant for very short episodes; for any real ~20-30s - # episode it had no effect, but it silently set the count to 0 on - # short fixtures. Just take ``min(max, len)`` directly. - n = min(self.config.max_interjections_per_episode, len(candidate_ts)) - if n <= 0: - return [] - chosen = sorted(rng.sample(candidate_ts, n)) + + n = min(self.config.max_interjections_per_episode, len(boundaries)) + chosen = sorted(rng.sample(boundaries, n), key=lambda b: b[0]) out: list[dict[str, Any]] = [] - for t in chosen: + for t, prev_subtask, next_subtask in chosen: t_snap = _snap_to_frame(t, record.frame_timestamps) + # Window straddles the boundary so the VLM sees the end of the + # previous subtask and the start of the next one — same + # conditioning the policy will see at training time. 
window_ts = self._window_timestamps(t_snap, record.frame_timestamps) - current_subtask = ( - self._subtask_at(subtask_spans, t_snap) or record.episode_task - ) prompt = load_prompt("module_2_interjection").format( episode_task=record.episode_task, - current_subtask=current_subtask, + prev_subtask=prev_subtask or "(starting from initial state)", + next_subtask=next_subtask, timestamp=t_snap, window_seconds=self.config.interjection_window_seconds, ) @@ -177,13 +200,14 @@ class InterjectionsAndSpeechModule: def _window_timestamps( self, t_anchor: float, frame_timestamps: Sequence[float] ) -> list[float]: - """Return a small set of frame timestamps spanning the lead-up to ``t``. + """Return a small set of frame timestamps centered on ``t_anchor``. - The VLM receives roughly ``num_frames`` frames over the - ``window_seconds`` immediately before ``t_anchor``, snapped to - actual source frame timestamps. This gives the interjection - prompt enough temporal context to read what's visibly happening - instead of looking at one frozen frame. + The window straddles the subtask boundary the interjection sits + on: roughly half the frames cover the end of the previous + subtask, half cover the start of the next one. The VLM therefore + sees BOTH what just finished AND what's about to start, which is + the conditioning we need to write a natural "now please do X" + request that matches the visible upcoming behavior. """ if not frame_timestamps: return [t_anchor] @@ -192,11 +216,15 @@ class InterjectionsAndSpeechModule: return [t_anchor] window = float(self.config.interjection_window_seconds) step = window / max(1, n - 1) - targets = [t_anchor - step * (n - 1 - i) for i in range(n)] + # Center the window on the anchor so half lands before, half after. 
+ start_offset = -window / 2.0 + targets = [t_anchor + start_offset + step * i for i in range(n)] + last_ts = float(frame_timestamps[-1]) snapped: list[float] = [] seen: set[float] = set() for tgt in targets: - t = _snap_to_frame(max(0.0, tgt), frame_timestamps) + clamped = min(last_ts, max(0.0, tgt)) + t = _snap_to_frame(clamped, frame_timestamps) if t not in seen: seen.add(t) snapped.append(t) diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt index 600251516..d6f77883f 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt @@ -1,34 +1,46 @@ -You are simulating a user mid-episode interruption for a robot doing: -"{episode_task}". +You are generating training data for a Hi Robot-style hierarchical +robot policy. The robot in this demonstration has ALREADY executed +every step shown in the video — we cannot retroactively change the +action stream. To keep training data consistent with the video, the +"interjection" must align with what the robot is *about to do next* in +the demonstration, framed as a natural mid-task user request. -The images above show roughly the last {window_seconds:.1f} seconds of the -demonstration in chronological order. Read what the robot is actually -doing right now and write an interruption that responds to that exact -visible activity — not a generic one. +The episode's overall task: "{episode_task}". -Current subtask the robot is executing: {current_subtask} -Time into episode: {timestamp:.2f}s +The images above show roughly {window_seconds:.1f} seconds straddling a +subtask boundary in the demonstration: -Synthesize ONE realistic interruption the user might say at this moment, -plus the robot's verbal acknowledgement. 
+- Subtask the robot just finished: "{prev_subtask}" +- Subtask the robot is about to start: "{next_subtask}" +- Time into episode: {timestamp:.2f}s -Context (Hi Robot, Shi 2025) — interjections fall into one of these -scenario types: -- negative task: "actually skip X" (where X is the visible current step) -- situated correction: "that's not the right one, use the blue one" -- specific constraint: "be more careful with that one" -- preference: "could you also do Y after this" +Write ONE interjection the user would naturally say at this moment to +prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it +like a real human mid-task remark — conversational, varied, sometimes +just a nudge, sometimes a clarification, sometimes a small constraint +that the upcoming motion happens to satisfy. Plus the robot's verbal +acknowledgement. -Interruption rules: -- Must reference an object, motion, or sub-step that is visible in the - attached frames OR explicitly named in the current subtask. Do not - invent objects that aren't there. -- Must change the plan in a non-trivial way (a new constraint, skipped - step, or correction). +Hard rules: + +- The interjection MUST be consistent with the next subtask. The user + cannot ask for something different from what the robot then does in + the video. If you're tempted to say "actually skip X" or "do Y + instead", DO NOT — those would contradict the demonstration. +- The interjection must reference an object, location, or action that + is plausible given the visible scene and the next subtask text. - One sentence each. Conversational, not robotic. +Style examples (vary the phrasing — don't reuse these verbatim): + - "Now go ahead and {next_subtask}." + - "Great, can you {next_subtask} next?" + - "{next_subtask}, please." + - "Before you continue, please {next_subtask}." + - "Looking good — {next_subtask} now." + - "Okay, {next_subtask}." 
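+ Output strictly valid JSON: {{ - "interjection": "", - "speech": "" + "interjection": "<one sentence the user says, consistent with the next subtask>", + "speech": "<one sentence the robot says to acknowledge the request>" }}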