From aa749d4947e4a69f392ce0199ee51ef9faf237cb Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 5 May 2026 15:07:18 +0200
Subject: [PATCH] chore(annotate): throttle Module 3 + executor parallelism to
 fix vLLM stall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Last bump combined ``module_3.K=3`` with ``vqa_emission_hz=2.0`` and
``executor.episode_parallelism=32``. With 2 cameras per dataset that produced
~12× the original VQA call volume, all submitted concurrently. Module 3
latency went from ~30s/phase to ~490s per episode, vLLM's KV cache pegged at
94% with 800+ in-flight requests, and the multimodal cache corrupted with
``AssertionError: Expected a cached item for mm_hash='...'`` (a known vLLM
bug under image-heavy concurrency). Modules 1 and 2 ran fine; Module 3 was
the bottleneck.

Pull back the multipliers to land in a sustainable spot:

* module_3.K: 3 (kept) — three diverse questions per emission, where the
  diversity actually helps the LM head.
* module_3.vqa_emission_hz: 2.0 → 1.0 — back to the original emission rate.
  Net VQA volume is now ~3× original (K alone) on a single camera, ~6× across
  both cameras — manageable.
* module_2.max_interjections_per_episode: 9 → 6 — still 2× the default, fewer
  than the prior 3× to keep total request volume in check.
* vlm.client_concurrency: 256 → 128 — gives vLLM headroom on the multimodal
  request path so the mm_cache doesn't desync.
* executor.episode_parallelism: 32 → 16 — half the episodes in flight at
  once, so peak vLLM load is roughly halved.

n_task_rephrasings stays at 30 (text-only, doesn't load the image path) and
vlm.temperature stays at 0.7. The diversity gains are preserved; only the
throughput knobs come down.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 examples/annotation/run_hf_job.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/annotation/run_hf_job.py b/examples/annotation/run_hf_job.py
index b69f66ed1..49af5da16 100644
--- a/examples/annotation/run_hf_job.py
+++ b/examples/annotation/run_hf_job.py
@@ -53,10 +53,10 @@ CMD = (
     "--tensor-parallel-size 1 --max-model-len 32768 "
     '--gpu-memory-utilization 0.8 --uvicorn-log-level warning --port {port}" '
     "--vlm.serve_ready_timeout_s=1800 "
-    "--vlm.client_concurrency=256 "
+    "--vlm.client_concurrency=128 "
     "--vlm.max_new_tokens=512 "
     "--vlm.temperature=0.7 "
-    "--executor.episode_parallelism=32 "
+    "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.wrist "
     "--module_1.frames_per_second=1.0 "
@@ -64,9 +64,9 @@ CMD = (
     "--module_1.use_video_url_fps=1.0 "
     "--module_1.derive_task_from_video=always "
     "--module_1.n_task_rephrasings=30 "
-    "--module_2.max_interjections_per_episode=9 "
+    "--module_2.max_interjections_per_episode=6 "
     "--module_3.K=3 "
-    "--module_3.vqa_emission_hz=2.0 "
+    "--module_3.vqa_emission_hz=1.0 "
     "--push_to_hub=pepijn223/super_poulain_full_tool3"
)
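
Editor's note: a minimal back-of-envelope sketch of the multipliers cited in the
commit message, assuming per-episode VQA volume scales linearly with K, the
emission rate, and the camera count, and that the implied baseline is K=1 at
1.0 Hz on one camera. This is an illustration, not code from the repository;
the real request count also depends on episode length and Module 3's emission
logic.

    # Hypothetical illustration: relative VQA call volume vs. the inferred
    # original config (K=1, 1.0 Hz emissions, one camera).
    def vqa_volume_multiplier(k: int, emission_hz: float, n_cameras: int) -> float:
        baseline = 1 * 1.0 * 1  # assumed original: K=1, 1.0 Hz, single camera
        return (k * emission_hz * n_cameras) / baseline

    print(vqa_volume_multiplier(3, 2.0, 2))  # previous bump, both cameras: 12.0 (~12x)
    print(vqa_volume_multiplier(3, 1.0, 1))  # new config, single camera: 3.0 (~3x)
    print(vqa_volume_multiplier(3, 1.0, 2))  # new config, both cameras: 6.0 (~6x)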