From 1394a6ab5daaa0fc4eacfa5377b0f526d0a2c6bf Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 5 May 2026 14:32:05 +0200 Subject: [PATCH] chore(annotate): bump diversity knobs ~3x to fight memorisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following Pi0.7 §V (prompt expansion / diverse context conditioning), push more atom variants per episode and higher VLM sampling temperature so the training distribution has enough wording diversity that the LM head is forced to use its parameters rather than memorise specific (prompt, target) pairs. Changes vs prior annotation pass: * vlm.temperature: 0.2 (default) → 0.7 — every Module-1/2/3 call now produces diverse phrasings; same prompt yields different completions across emissions. * module_1.n_task_rephrasings: 10 → 30 — three times as many ``task_aug`` rows in language_persistent. ``${task}`` already rotates through them deterministically per sample_idx (see ``_resolve_task`` in language_render.py). * module_2.max_interjections_per_episode: 3 (default) → 9 — more ``user_interjection_response`` training samples + more plan refresh events. * module_3.K: 1 → 3 — three VQA pairs per emission tick instead of one. Combined with the hz bump below, ~6× more VQA samples. * module_3.vqa_emission_hz: 1.0 → 2.0 — double the VQA emission rate within each subtask span. Pushes to a new hub repo (``_tool3``) so the working ``_tool2`` dataset stays intact for comparison. ``${task}`` is already wired to rotate through ``task_aug`` rows, so no renderer change needed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/annotation/run_hf_job.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/examples/annotation/run_hf_job.py b/examples/annotation/run_hf_job.py index 672d9f368..b69f66ed1 100644 --- a/examples/annotation/run_hf_job.py +++ b/examples/annotation/run_hf_job.py @@ -23,6 +23,18 @@ token = os.environ.get("HF_TOKEN") or get_token() if not token: raise RuntimeError("No HF token. Run `huggingface-cli login` or `export HF_TOKEN=hf_...`") +# --- Diversity knobs (Pi0.7-style prompt expansion) ----------------------- +# Bumped roughly 3x across the board to fight memorization on small datasets. +# A single dataset trained for many epochs with deterministic atom wording +# converges to perfect recall on training prompts but produces JSON-token +# garbage at inference for any wording that drifts slightly. More atom +# variants per episode + higher sampling temperature widens the training +# distribution so the model has to actually use its language head, not +# just memorize. +# +# Pushes to a *new* hub repo (``_tool3``) so the previous annotation pass +# (``_tool2``) stays intact — re-train from scratch on the new dataset and +# compare loss-curve shapes to verify the diversity bump is doing something. 
CMD = ( "apt-get update -qq && apt-get install -y -qq git ffmpeg && " "pip install --no-deps " @@ -43,6 +55,7 @@ CMD = ( "--vlm.serve_ready_timeout_s=1800 " "--vlm.client_concurrency=256 " "--vlm.max_new_tokens=512 " + "--vlm.temperature=0.7 " "--executor.episode_parallelism=32 " "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' " "--vlm.camera_key=observation.images.wrist " @@ -50,10 +63,11 @@ CMD = ( "--module_1.use_video_url=true " "--module_1.use_video_url_fps=1.0 " "--module_1.derive_task_from_video=always " - "--module_1.n_task_rephrasings=10 " - "--module_3.K=1 " - "--module_3.vqa_emission_hz=1.0 " - "--push_to_hub=pepijn223/super_poulain_full_tool2" + "--module_1.n_task_rephrasings=30 " + "--module_2.max_interjections_per_episode=9 " + "--module_3.K=3 " + "--module_3.vqa_emission_hz=2.0 " + "--push_to_hub=pepijn223/super_poulain_full_tool3" ) job = run_job(