From 1394a6ab5daaa0fc4eacfa5377b0f526d0a2c6bf Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 5 May 2026 14:32:05 +0200 Subject: [PATCH] chore(annotate): bump diversity knobs ~3x to fight memorisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following Pi0.7 §V (prompt expansion / diverse context conditioning), push more atom variants per episode and higher VLM sampling temperature so the training distribution has enough wording diversity that the LM head is forced to use its parameters rather than memorise specific (prompt, target) pairs. Changes vs prior annotation pass: * vlm.temperature: 0.2 (default) → 0.7 — every Module-1/2/3 call now produces diverse phrasings; same prompt yields different completions across emissions. * module_1.n_task_rephrasings: 10 → 30 — three times as many ``task_aug`` rows in language_persistent. ``${task}`` already rotates through them deterministically per sample_idx (see ``_resolve_task`` in language_render.py). * module_2.max_interjections_per_episode: 3 (default) → 9 — more ``user_interjection_response`` training samples + more plan refresh events. * module_3.K: 1 → 3 — three VQA pairs per emission tick instead of one. Combined with the hz bump below, ~6× more VQA samples. * module_3.vqa_emission_hz: 1.0 → 2.0 — double the VQA emission rate within each subtask span. Pushes to a new hub repo (``_tool3``) so the working ``_tool2`` dataset stays intact for comparison. ``${task}`` is already wired to rotate through ``task_aug`` rows, so no renderer change needed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/annotation/run_hf_job.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/examples/annotation/run_hf_job.py b/examples/annotation/run_hf_job.py index 672d9f368..b69f66ed1 100644 --- a/examples/annotation/run_hf_job.py +++ b/examples/annotation/run_hf_job.py @@ -23,6 +23,18 @@ token = os.environ.get("HF_TOKEN") or get_token() if not token: raise RuntimeError("No HF token. Run `huggingface-cli login` or `export HF_TOKEN=hf_...`") +# --- Diversity knobs (Pi0.7-style prompt expansion) ----------------------- +# Bumped roughly 3x across the board to fight memorization on small datasets. +# A single dataset trained for many epochs with deterministic atom wording +# converges to perfect recall on training prompts but produces JSON-token +# garbage at inference for any wording that drifts slightly. More atom +# variants per episode + higher sampling temperature widens the training +# distribution so the model has to actually use its language head, not +# just memorize. +# +# Pushes to a *new* hub repo (``_tool3``) so the previous annotation pass +# (``_tool2``) stays intact — re-train from scratch on the new dataset and +# compare loss-curve shapes to verify the diversity bump is doing something. 
CMD = ( "apt-get update -qq && apt-get install -y -qq git ffmpeg && " "pip install --no-deps " @@ -43,6 +55,7 @@ CMD = ( "--vlm.serve_ready_timeout_s=1800 " "--vlm.client_concurrency=256 " "--vlm.max_new_tokens=512 " + "--vlm.temperature=0.7 " "--executor.episode_parallelism=32 " "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' " "--vlm.camera_key=observation.images.wrist " @@ -50,10 +63,11 @@ CMD = ( "--module_1.use_video_url=true " "--module_1.use_video_url_fps=1.0 " "--module_1.derive_task_from_video=always " - "--module_1.n_task_rephrasings=10 " - "--module_3.K=1 " - "--module_3.vqa_emission_hz=1.0 " - "--push_to_hub=pepijn223/super_poulain_full_tool2" + "--module_1.n_task_rephrasings=30 " + "--module_2.max_interjections_per_episode=9 " + "--module_3.K=3 " + "--module_3.vqa_emission_hz=2.0 " + "--push_to_hub=pepijn223/super_poulain_full_tool3" ) job = run_job(