diff --git a/examples/annotation/run_hf_job.py b/examples/annotation/run_hf_job.py
index 49af5da16..913815153 100644
--- a/examples/annotation/run_hf_job.py
+++ b/examples/annotation/run_hf_job.py
@@ -23,18 +23,6 @@
 token = os.environ.get("HF_TOKEN") or get_token()
 if not token:
     raise RuntimeError("No HF token. Run `huggingface-cli login` or `export HF_TOKEN=hf_...`")
-# --- Diversity knobs (Pi0.7-style prompt expansion) -----------------------
-# Bumped roughly 3x across the board to fight memorization on small datasets.
-# A single dataset trained for many epochs with deterministic atom wording
-# converges to perfect recall on training prompts but produces JSON-token
-# garbage at inference for any wording that drifts slightly. More atom
-# variants per episode + higher sampling temperature widens the training
-# distribution so the model has to actually use its language head, not
-# just memorize.
-#
-# Pushes to a *new* hub repo (``_tool3``) so the previous annotation pass
-# (``_tool2``) stays intact — re-train from scratch on the new dataset and
-# compare loss-curve shapes to verify the diversity bump is doing something.
 CMD = (
     "apt-get update -qq && apt-get install -y -qq git ffmpeg && "
     "pip install --no-deps "
diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml
index 40a20387d..c2f80da71 100644
--- a/src/lerobot/configs/recipes/pi052_hirobot.yaml
+++ b/src/lerobot/configs/recipes/pi052_hirobot.yaml
@@ -1,51 +1,13 @@
-# π0.5 v2 (pi052) — Hi-Robot / MEM / ECoT blend.
+# π0.5 v2 (pi052) Hi-Robot blend.
 #
-# Architecturally mirrors ``smolvla2_hirobot.yaml`` — same two
-# flavors, same sub-recipes — but the rendered messages are fed
-# to PaliGemma (PaliGemma is not chat-pretrained, so the
-# ``PI052TextTokenizerStep`` concatenates them as ``Role: content``
-# plain text rather than calling ``apply_chat_template``).
-#
-# Two flavors
-# -----------
-#
-# Flavor 1 — ``action_execution`` (~60% weight)
-#     The main always-on recipe. Fuses all available context
-#     (task + plan + memory) into a unified user prompt, and
-#     uses the current subtask as the assistant target. This
-#     single recipe supervises *both*:
-#       * subtask prediction (text CE on the assistant span,
-#         lm_head), and
-#       * action chunks (flow MSE on the action expert via
-#         ``stream: low_level, target: true``, plus the FAST
-#         CE on the action tokens when enabled).
-#     Pi 0.7 §V.A — subtask in the prompt + flow on actions.
-#
-# Flavor 2 — event-driven text-only recipes
-#     ``ask_vqa_*``. Each handles a specific high-level event
-#     with a TEXT output. ``if_present`` guards keep them from
-#     firing on frames without the relevant annotation.
-#
-# Memory updates are folded INTO ``action_execution`` as a
-# conditional second target gated on boundary frames — see
-# ``smolvla2_hirobot.yaml`` for the rationale. The
-# ``user_interjection_response`` recipe was dropped — the
-# current datasets don't include interjection / say() annotations.
+# Same shape as ``smolvla2_hirobot.yaml`` — see that file for the
+# flavor breakdown. The only difference here is the backbone:
+# PaliGemma isn't chat-pretrained, so ``PI052TextTokenizerStep``
+# concatenates messages as ``Role: content`` plain text instead
+# of calling ``apply_chat_template``.
 blend:
-  # ----------------------------------------------------------
-  # FLAVOR 1: action_execution (main path)
-  #
-  # Bundles memory updates inline. On most frames the binding
-  # ``new_memory: emitted_at(t, style=memory)`` returns None and
-  # only the subtask is supervised. On *boundary* frames (the
-  # exact timestamp a new memory was annotated — i.e. when a
-  # subtask just completed) the binding fires and the recipe
-  # supervises the new memory as a follow-up assistant turn,
-  # with a "Completed subtask: …" user message in between to
-  # separate the two outputs in the rendered prefix.
-  # ----------------------------------------------------------
   action_execution:
     weight: 0.85
     bindings:
@@ -55,17 +17,10 @@ blend:
         stream: high_level
         content: "${task}\nPlan: ${plan}\nMemory: ${memory}"
       - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask}
-      # Memory-update tail — only renders at boundary frames where
-      # ``new_memory`` fires. The new memory is appended as a second
-      # assistant turn right after the subtask, with no intervening
-      # user filler: at a subtask boundary the model emits the new
-      # subtask AND the updated memory in one forward pass.
+      # Boundary-frame tail: at a subtask transition, predict the
+      # new memory as a second assistant turn (same forward pass).
       - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
-  # ----------------------------------------------------------
-  # FLAVOR 2: event-driven text-only paths
-  # ----------------------------------------------------------
-
   ask_vqa_top:
     weight: 0.075
     bindings:
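Aside (not part of the patch): a minimal Python sketch of the "Role: content" flattening the pi052 header describes. It is illustrative only, assuming messages are plain {role, content} dicts; the real ``PI052TextTokenizerStep`` lives in the tokenizer pipeline and ``render_plain`` is a hypothetical helper.

    # Hypothetical sketch of the plain-text rendering used instead of
    # apply_chat_template for a backbone (PaliGemma) that is not
    # chat-pretrained. Not the actual PI052TextTokenizerStep.
    def render_plain(messages: list[dict]) -> str:
        """Concatenate chat messages as 'Role: content' lines."""
        return "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in messages)

    messages = [
        {"role": "user", "content": "set the table\nPlan: ...\nMemory: ..."},
        {"role": "assistant", "content": "pick up the fork"},
    ]
    print(render_plain(messages))
    # User: set the table
    # Plan: ...
    # Memory: ...
    # Assistant: pick up the fork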
diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
index d96bd168d..8579d9622 100644
--- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
+++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
@@ -1,68 +1,13 @@
-# SmolVLA2 canonical training recipe — Hi Robot / MEM / ECoT blend.
+# SmolVLA2 Hi-Robot blend — two flavors:
 #
-# Inspired by Pi 0.7 §V (Diversifying the Prompt) and Pi 0.5's
-# hierarchical subtask training. The blend has **two flavors**:
-#
-# Flavor 1 — ``action_execution`` (~60% weight)
-#     The main always-on recipe. Fuses all available context
-#     (task + plan + memory) into a unified user prompt, and
-#     uses the current subtask as the assistant target. This
-#     single recipe supervises *both*:
-#       * subtask prediction (text CE on the assistant span,
-#         lm_head), and
-#       * action chunks (flow MSE on the action expert via
-#         ``stream: low_level, target: true``, plus the FAST
-#         CE on the action tokens when enabled).
-#     At inference, the same prompt structure is used:
-#       * the high-level loop calls ``select_message`` with the
-#         user prompt only → generates the next subtask.
-#       * the low-level loop calls ``predict_action_chunk`` with
-#         the user prompt + the generated subtask as the
-#         assistant turn → generates the action chunk.
-#     Replaces what used to be three separate recipes
-#     (``high_level_subtask`` + ``low_level_execution`` + the
-#     implicit subtask-in-prompt context) in earlier drafts.
-#     Pi 0.7's §V.A "Subtask instructions" pattern.
-#
-# Flavor 2 — event-driven text-only recipes
-#     Each handles a specific high-level event with a TEXT
-#     output (no action supervision). They fire when the
-#     binding for the event resolves to non-None:
-#       * ``ask_vqa_top`` / ``ask_vqa_wrist``: answer a
-#         camera-grounded visual question.
-#     All use ``stream: high_level`` (no flow loss) and rely on
-#     ``if_present`` guards so they only fire on frames where
-#     the relevant event annotation is present.
-#
-# ``memory_update`` is folded into Flavor 1 (gated on the
-# ``new_memory`` binding at boundary frames).
-# ``user_interjection_response`` was dropped — the current
-# datasets don't include interjection / say() annotations.
-#
-# How the chat tokenizer interprets the flavor split
-# ---------------------------------------------------
-# * predict_actions = bool(targets_by_stream.get("low_level"))
-#   → True only for Flavor 1 (action_execution).
-# * text_labels supervises whatever assistant turns are marked
-#   target=true. For action_execution, this is the subtask
-#   string. For Flavor 2, it's the corresponding text output.
+# 1. action_execution — fused (task + plan + memory) prompt;
+#    supervises the current subtask (low_level: flow + text CE)
+#    and, at memory-boundary frames, the new memory too.
+# 2. ask_vqa_{top,wrist} — text-only VQA on a camera image,
+#    gated by ``if_present`` so they only fire on annotated frames.
 blend:
-  # ----------------------------------------------------------
-  # FLAVOR 1: action_execution (main path)
-  #
-  # Bundles memory updates inline. On most frames the binding
-  # ``new_memory: emitted_at(t, style=memory)`` returns None and
-  # only the subtask is supervised. On *boundary* frames (the
-  # exact timestamp a new memory was annotated — i.e. when a
-  # subtask just completed) the binding fires and the recipe
-  # supervises the new memory as a follow-up assistant turn,
-  # with a "Completed subtask: …" user message in between to
-  # separate the two outputs in the chat sequence. Mirrors the
-  # behaviour of the old standalone ``memory_update`` recipe
-  # but keeps everything inside the unified action_execution.
-  # ----------------------------------------------------------
   action_execution:
     weight: 0.85
     bindings:
@@ -72,17 +17,10 @@ blend:
         stream: high_level
         content: "${task}\nPlan: ${plan}\nMemory: ${memory}"
       - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask}
-      # Memory-update tail — only renders at boundary frames where
-      # ``new_memory`` fires. The new memory is appended as a second
-      # assistant turn right after the subtask, with no intervening
-      # user filler: at a subtask boundary the model emits the new
-      # subtask AND the updated memory in one forward pass.
+      # Boundary-frame tail: at a subtask transition, predict the
+      # new memory as a second assistant turn (same forward pass).
       - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
-  # ----------------------------------------------------------
-  # FLAVOR 2: event-driven text-only paths
-  # ----------------------------------------------------------
-
   ask_vqa_top:
     weight: 0.075
     bindings:
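Aside (not part of the patch): a minimal sketch of the ``if_present`` gating and boundary-frame tail the comments above describe. Binding names (subtask, new_memory) mirror the YAML; the ``render`` helper, the simplified RECIPE, and the example bindings are hypothetical, not the actual recipe renderer.

    # Hypothetical sketch: messages whose if_present binding resolves to
    # None are dropped, so the new_memory assistant turn only renders on
    # boundary frames. Streams and targets are omitted for brevity.
    from string import Template

    RECIPE = [
        {"role": "user", "content": "${task}\nPlan: ${plan}\nMemory: ${memory}"},
        {"role": "assistant", "content": "${subtask}", "if_present": "subtask"},
        {"role": "assistant", "content": "${new_memory}", "if_present": "new_memory"},
    ]

    def render(bindings: dict) -> list[dict]:
        """Drop any message whose if_present binding is None, then fill
        the ${...} placeholders from the remaining bindings."""
        out = []
        for msg in RECIPE:
            guard = msg.get("if_present")
            if guard is not None and bindings.get(guard) is None:
                continue  # e.g. new_memory is None on non-boundary frames
            out.append({"role": msg["role"],
                        "content": Template(msg["content"]).safe_substitute(bindings)})
        return out

    # Ordinary frame: user prompt + subtask assistant turn only.
    ordinary = render({"task": "set the table", "plan": "...", "memory": "...",
                       "subtask": "pick up the fork"})
    # Boundary frame: the new memory renders as a second assistant turn.
    boundary = render({"task": "set the table", "plan": "...", "memory": "...",
                       "subtask": "place the fork", "new_memory": "fork is on the table"})
    assert len(ordinary) == 2 and len(boundary) == 3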