From 1fb46ab30057f312d0c885183ecebaa9824ef702 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 2 Jun 2026 16:02:25 +0200 Subject: [PATCH] annotate: cap embedded-frame budget to fit VLM context (fix 32k overflow) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switching the plan module to embedded frames (use_video_url=false) exposed a context overflow: at frames_per_second=2.0 with the old max_video_frames=128 default, a 480x640 episode embeds ~128 frames ≈ 33-39k vision tokens, over the model's 32768 context — every plan call died with 'Input length exceeds maximum context length' (HTTP 400), crashing the whole annotation job. The video_url path never hit this because the server downsampled; the embedded path sends every sampled frame, so the frame count is a hard token budget. Fix: * config default max_video_frames 128 -> 32 (~8-10k vision tokens, comfortable headroom for the prompt + describe/verify passes). Frames are still sampled UNIFORMLY across the whole episode, so longer episodes are subsampled, not truncated — full temporal coverage preserved, just coarser density. * run_hf_job.py: frames_per_second 2.0 -> 1.0, explicit --plan.max_video_frames=32, with a comment explaining the token budget and the 'do not raise toward 128 with embedded frames' rule. Only the plan module embeds the full episode; VQA (1 frame/tick) and interjections (4-frame window) were never at risk. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/annotations/run_hf_job.py | 14 +++++++++++--- .../annotations/steerable_pipeline/config.py | 14 ++++++++++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index ade582861..e83a56db8 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -58,10 +58,18 @@ CMD = ( # handing the server a file:// clip. The embedded path is more # reliable: if clip extraction ever fails, the video_url path would # silently send NO video and the VLM would hallucinate subtasks from - # the task text alone. 2 fps gives dense visual grounding so the VLM - # labels what actually happens. - "--plan.frames_per_second=2.0 " + # the task text alone. + # + # CONTEXT BUDGET: with embedded frames, each frame is ~250-320 vision + # tokens. The model's context is 32768 (see --max-model-len). 32 + # frames sampled uniformly across the episode (~8-10k tokens) fits + # comfortably alongside the prompt and the describe/verify passes. + # Do NOT raise max_video_frames toward 128 with embedded frames — that + # is ~33-39k tokens and overflows the context (BadRequestError 400, + # "Input length exceeds maximum context length"). "--plan.use_video_url=false " + "--plan.frames_per_second=1.0 " + "--plan.max_video_frames=32 " # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the # stove", "Pick the mug...") is authoritative and is what eval uses. # ``derive_task_from_video=off`` keeps that canonical task driving diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index 09c935e66..37371a7fb 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -44,9 +44,19 @@ class PlanConfig: derive_task_from_video: str = "if_short" derive_task_min_words: int = 3 - # Frame sampling for the subtask-decomposition prompt. + # Frame sampling for the subtask-decomposition prompt. Frames are + # sampled uniformly across the whole episode up to ``max_video_frames`` + # (so longer episodes are subsampled, not truncated). + # + # ``max_video_frames`` is a HARD context-budget cap. With the embedded- + # frame path (use_video_url=false), every frame becomes ~250-320 vision + # tokens, so 128 frames ≈ 33-39k tokens — over a 32k-context VLM. 32 + # frames (~8-10k tokens) leaves ample room for the prompt + the + # describe / verify passes. Raise only if your serving context is + # larger AND your episodes need finer temporal resolution; if you hit + # "Input length exceeds maximum context length", lower this. frames_per_second: float = 1.0 - max_video_frames: int = 128 + max_video_frames: int = 32 min_subtask_seconds: float = 1.5 plan_max_steps: int = 8