diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 01ef58f4d..f669593e9 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -53,13 +53,30 @@ CMD = (
     "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.robot0_agentview_right "
-    # Phase 1 — plan module (subtasks + plan + memory + task_aug).
+    # Phase 1 — plan module (subtasks + plan + memory).
     "--plan.frames_per_second=1.0 "
     "--plan.use_video_url=true "
     "--plan.use_video_url_fps=1.0 "
-    "--plan.derive_task_from_video=always "
-    "--plan.task_aug_axes.enabled=true "
-    "--plan.action_records.enabled=true "
+    # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
+    # stove", "Pick the mug...") is authoritative and is what eval uses.
+    # ``derive_task_from_video=off`` keeps that canonical task driving
+    # subtask generation. Do NOT use ``always`` here — it throws the real
+    # task away, asks the VLM "what is this video about?" with no hint,
+    # and the hallucinated task then poisons every subtask + plan row.
+    "--plan.derive_task_from_video=off "
+    # NO task augmentation for RoboCasa: eval conditions on the exact task
+    # strings, so synthetic rephrasings are unused at best and harmful when
+    # they drift (e.g. "wander around the kitchen"). With 0 rephrasings and
+    # task_aug_axes not enabled, the policy only ever sees the canonical task.
+    "--plan.n_task_rephrasings=0 "
+    # action_records OFF (flag not passed): the structured
+    # {verb,object,arm,grasp,dest} schema is a manipulation schema; RoboCasa
+    # navigation / atomic tasks don't fit it and the VLM hallucinates (e.g.
+    # "move stove to stove"). Only enable it for long composite manipulation
+    # tasks you've verified render cleanly; even then replace_subtask_text
+    # defaults to off, so records stay additive and never overwrite subtasks.
+    # Separately, keep subtask decomposition tight for atomic tasks:
+    "--plan.plan_max_steps=6 "
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index c60e58fee..a3c1306be 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -94,12 +94,18 @@ class ActionRecordsConfig:
 
     A deterministic Python template then renders the record back to
     canonical subtask text (e.g. ``pick blue cube with left arm using
-    pinch grip``). When ``replace_subtask_text=True`` (default), the
-    rendered text REPLACES the VLM's free-form subtask text — eliminating
-    cross-episode phrasing drift. When ``emit_record_row=True``
-    (default), the structured record is also emitted as a row with
-    ``style="action_record"`` so downstream consumers can train on the
-    typed schema directly.
+    pinch grip``). When ``replace_subtask_text=True``, the rendered text
+    REPLACES the VLM's free-form subtask text. This is OFF by default:
+    the structured fields are easy for the VLM to hallucinate on tasks
+    that don't fit the manipulation schema (e.g. navigation tasks yield
+    nonsense like ``move stove to stove``), and silently overwriting the
+    subtask text with a reconstruction is high-risk. Leave it off to keep
+    the original VLM subtask text and treat the record as additive
+    metadata; only flip it on for datasets you've verified render
+    cleanly. When ``emit_record_row=True`` (default), the structured
+    record is also emitted as a row with ``style="action_record"`` so
+    downstream consumers can train on the typed schema directly —
+    without touching the subtask text.
 
     Cost: one extra VLM call per subtask. For an 8-subtask episode this
     means ~8x more VLM calls in the plan module — still cheap relative
@@ -110,9 +116,11 @@ class ActionRecordsConfig:
 
     # When True, replace the VLM-generated subtask text with the
     # deterministic template's rendering of the structured record.
-    # Strongly recommended — it's the whole point of the structured
-    # intermediate. Set False to keep both representations side by side.
-    replace_subtask_text: bool = True
+    # OFF by default — see class docstring. Overwriting good subtask
+    # text with a reconstruction of hallucinated structured fields is
+    # high-risk (navigation / non-manipulation tasks render to
+    # nonsense). Keep records additive (``emit_record_row``) instead.
+    replace_subtask_text: bool = False
 
     # When True, emit a separate row with ``style="action_record"`` and
     # ``content=json.dumps(record)`` at the subtask's start timestamp.
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index f58ec2c91..6ef5352b0 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -424,6 +424,13 @@ class PlanSubtasksMemoryModule:
         if not verb:
             return ""
 
+        # Drop a degenerate destination that just echoes the object — the
+        # VLM sometimes fills both with the same noun (e.g. navigation:
+        # ``verb=move object=stove destination=stove`` → "move stove to
+        # stove"). Treat that as "no meaningful destination".
+        if dest and obj and dest.strip().lower() == obj.strip().lower():
+            dest = ""
+
         parts: list[str] = [verb]
         if obj:
             parts.append(obj)
@@ -431,7 +438,7 @@ class PlanSubtasksMemoryModule:
             # Pick a sensible preposition per verb family.
             if verb in {"place", "put", "drop", "insert", "pour", "dump"}:
                 parts.append(f"in {dest}")
-            elif verb in {"move", "transport", "reach"}:
+            elif verb in {"move", "transport", "reach", "navigate"}:
                 parts.append(f"to {dest}")
             else:
                 parts.append(f"at {dest}")
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
index d8cd13104..8b19a0a8e 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
@@ -37,9 +37,16 @@ Axes and target counts:
     orientation, grasp_method) appear in the original task.
 
 Hard rules:
-- Each variant MUST preserve the core action and the target object.
-  Do not change which object is involved, the destination, or the
-  high-level action.
+- Each variant MUST preserve the core action, the target object, AND
+  the goal / destination. Do not change which object is involved, where
+  it goes, or the high-level action. "Navigate to the stove" may become
+  "go to the stove" or "head over to the stove" — it must NEVER become
+  "wander around the kitchen", "explore the room", or anything that
+  drops or generalises the stove destination. If you cannot vary the
+  wording without changing the goal, emit fewer variants.
+- Only the FIVE listed elements (wording, arm, orientation, grasp
+  method, or a combination) may be varied or omitted. The verb's
+  meaning, the object, and the destination are fixed.
 - Each variant is plain prose, no markdown, no quotes, no list numbers.
 - Each variant must be DISTINCT from every other variant in the entire
   output, both within and across axes. Near-duplicates are not allowed.