diff --git a/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py b/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py
index b65b08b6a..f434d9b9e 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py
@@ -120,33 +120,56 @@ class InterjectionsAndSpeechModule:
         record: EpisodeRecord,
         subtask_spans: Sequence[dict[str, Any]],
     ) -> list[dict[str, Any]]:
+        """Generate interjections aligned with the actual demo trajectory.
+
+        Teleop data is frozen — the robot already executed every step in
+        the video. A *counterfactual* interjection like "actually skip
+        the wipe" contradicts what then happens in the video, which is
+        what qwen36moe-10/11 surfaced as low-quality interjections.
+
+        Instead, anchor every interjection at a subtask boundary and
+        write it as a natural user request for the *upcoming* subtask.
+        The robot's visible next behavior IS the interjection's effect,
+        so the training signal stays consistent: interjection text →
+        plan refresh → action stream all line up.
+        """
         if self.config.max_interjections_per_episode <= 0:
             return []
+        if len(subtask_spans) < 2:
+            # Need at least one transition (subtask 0 → subtask 1).
+            return []
         # Deterministic per-episode RNG so reruns are stable across SLURM jobs.
         rng = random.Random(f"{self.seed}:{record.episode_index}:interjection")
-        candidate_ts = [t for t in record.frame_timestamps if t >= self.config.interjection_min_t]
-        if not candidate_ts:
+
+        # Boundaries: the start time of every subtask except the first
+        # (which is just t0 and is covered by the initial-task speech atom).
+        boundaries: list[tuple[float, str, str]] = []
+        for i in range(1, len(subtask_spans)):
+            ts = float(subtask_spans[i]["start"])
+            if ts < self.config.interjection_min_t:
+                continue
+            prev_text = (subtask_spans[i - 1].get("text") or "").strip()
+            next_text = (subtask_spans[i].get("text") or "").strip()
+            if not next_text:
+                continue
+            boundaries.append((ts, prev_text, next_text))
+        if not boundaries:
             return []
-        # Pick at most ``max_interjections_per_episode`` distinct timestamps.
-        # Previously capped at ``len(candidate_ts) // 4`` — that floor was
-        # only relevant for very short episodes; for any real ~20-30s
-        # episode it had no effect, but it silently set the count to 0 on
-        # short fixtures. Just take ``min(max, len)`` directly.
-        n = min(self.config.max_interjections_per_episode, len(candidate_ts))
-        if n <= 0:
-            return []
-        chosen = sorted(rng.sample(candidate_ts, n))
+
+        n = min(self.config.max_interjections_per_episode, len(boundaries))
+        chosen = sorted(rng.sample(boundaries, n), key=lambda b: b[0])
 
         out: list[dict[str, Any]] = []
-        for t in chosen:
+        for t, prev_subtask, next_subtask in chosen:
             t_snap = _snap_to_frame(t, record.frame_timestamps)
+            # Window straddles the boundary so the VLM sees the end of the
+            # previous subtask and the start of the next one — same
+            # conditioning the policy will see at training time.
             window_ts = self._window_timestamps(t_snap, record.frame_timestamps)
-            current_subtask = (
-                self._subtask_at(subtask_spans, t_snap) or record.episode_task
-            )
             prompt = load_prompt("module_2_interjection").format(
                 episode_task=record.episode_task,
-                current_subtask=current_subtask,
+                prev_subtask=prev_subtask or "(starting from initial state)",
+                next_subtask=next_subtask,
                 timestamp=t_snap,
                 window_seconds=self.config.interjection_window_seconds,
             )
@@ -177,13 +200,14 @@ class InterjectionsAndSpeechModule:
     def _window_timestamps(
         self, t_anchor: float, frame_timestamps: Sequence[float]
     ) -> list[float]:
-        """Return a small set of frame timestamps spanning the lead-up to ``t``.
+        """Return a small set of frame timestamps centered on ``t_anchor``.
 
-        The VLM receives roughly ``num_frames`` frames over the
-        ``window_seconds`` immediately before ``t_anchor``, snapped to
-        actual source frame timestamps. This gives the interjection
-        prompt enough temporal context to read what's visibly happening
-        instead of looking at one frozen frame.
+        The window straddles the subtask boundary the interjection sits
+        on: about half the frames cover the end of the previous subtask
+        and half the start of the next, so the VLM sees BOTH what just
+        finished AND what's about to start — the conditioning needed to
+        write a natural "now please do X" request. Targets are clamped to
+        [0, last frame timestamp] and de-duplicated after snapping.
         """
         if not frame_timestamps:
             return [t_anchor]
@@ -192,11 +216,15 @@ class InterjectionsAndSpeechModule:
             return [t_anchor]
         window = float(self.config.interjection_window_seconds)
         step = window / max(1, n - 1)
-        targets = [t_anchor - step * (n - 1 - i) for i in range(n)]
+        # Center the window on the anchor so half lands before, half after.
+        start_offset = -window / 2.0
+        targets = [t_anchor + start_offset + step * i for i in range(n)]
+        last_ts = float(frame_timestamps[-1])
         snapped: list[float] = []
         seen: set[float] = set()
         for tgt in targets:
-            t = _snap_to_frame(max(0.0, tgt), frame_timestamps)
+            clamped = min(last_ts, max(0.0, tgt))
+            t = _snap_to_frame(clamped, frame_timestamps)
             if t not in seen:
                 seen.add(t)
                 snapped.append(t)
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt
index 600251516..d6f77883f 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt
@@ -1,34 +1,46 @@
-You are simulating a user mid-episode interruption for a robot doing:
-"{episode_task}".
+You are generating training data for a Hi Robot-style hierarchical
+robot policy. The robot in this demonstration has ALREADY executed
+every step shown in the video — we cannot retroactively change the
+action stream. To keep training data consistent with the video, the
+"interjection" must align with what the robot is *about to do next* in
+the demonstration, framed as a natural mid-task user request.
 
-The images above show roughly the last {window_seconds:.1f} seconds of the
-demonstration in chronological order. Read what the robot is actually
-doing right now and write an interruption that responds to that exact
-visible activity — not a generic one.
+The episode's overall task: "{episode_task}".
 
-Current subtask the robot is executing: {current_subtask}
-Time into episode: {timestamp:.2f}s
+The images above show roughly {window_seconds:.1f} seconds straddling a
+subtask boundary in the demonstration:
 
-Synthesize ONE realistic interruption the user might say at this moment,
-plus the robot's verbal acknowledgement.
+- Subtask the robot just finished: "{prev_subtask}"
+- Subtask the robot is about to start: "{next_subtask}"
+- Time into episode: {timestamp:.2f}s
 
-Context (Hi Robot, Shi 2025) — interjections fall into one of these
-scenario types:
-- negative task: "actually skip X" (where X is the visible current step)
-- situated correction: "that's not the right one, use the blue one"
-- specific constraint: "be more careful with that one"
-- preference: "could you also do Y after this"
+Write ONE interjection the user would naturally say at this moment to
+prompt, confirm, or encourage the robot to do "{next_subtask}". Phrase
+it like a real human mid-task remark — conversational, varied, sometimes
+just a nudge, sometimes a clarification, sometimes a small constraint
+that the upcoming motion happens to satisfy. Also write the robot's
+verbal acknowledgement of that request.
 
-Interruption rules:
-- Must reference an object, motion, or sub-step that is visible in the
-  attached frames OR explicitly named in the current subtask. Do not
-  invent objects that aren't there.
-- Must change the plan in a non-trivial way (a new constraint, skipped
-  step, or correction).
+Hard rules:
+
+- The interjection MUST be consistent with the next subtask. The user
+  cannot ask for something different from what the robot then does in
+  the video. If you're tempted to say "actually skip X" or "do Y
+  instead", DO NOT — those would contradict the demonstration.
+- The interjection must reference an object, location, or action that
+  is plausible given the visible scene and the next subtask text.
 - One sentence each. Conversational, not robotic.
 
+Style examples (vary the phrasing — don't reuse these verbatim):
+  - "Now go ahead and {next_subtask}."
+  - "Great, can you {next_subtask} next?"
+  - "{next_subtask}, please."
+  - "Before you continue, please {next_subtask}."
+  - "Looking good — {next_subtask} now."
+  - "Okay, {next_subtask}."
+
 Output strictly valid JSON:
   {{
-    "interjection": "<single sentence the user says about what is visible right now>",
-    "speech":       "<single sentence the robot speaks back, acknowledging the change>"
+    "interjection": "<single sentence the user says, asking for the next subtask>",
+    "speech":       "<single sentence the robot speaks back, confirming and starting>"
   }}