feat(annotate): forward chat_template_kwargs to OpenAI extra_body

Lets callers pass per-request template flags such as {"enable_thinking": false} for Qwen3.5/Qwen3.6 models, where the default thinking preamble otherwise consumes the entire max_new_tokens budget before any JSON is emitted.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 14:49:43 +00:00 · 2026-04-29 15:00:23 +02:00
parent 824aac9ad8
commit bda829af2f
2 changed files with 13 additions and 3 deletions
@@ -18,6 +18,7 @@ from __future__ import annotations

 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import Any


@dataclass
@@ -140,6 +141,12 @@ class VlmConfig:
    camera_key: str | None = None
    """Override the camera stream used for keyframe attachment. ``None`` picks
    the first ``observation.images.*`` key the dataset declares."""
+    chat_template_kwargs: dict[str, Any] | None = None
+    """Forwarded as ``extra_body.chat_template_kwargs`` on every chat call.
+    Use this to pass model-specific template flags such as
+    ``{"enable_thinking": false}`` for Qwen3.5/Qwen3.6 to suppress the
+    reasoning preamble that otherwise eats the entire ``max_new_tokens``
+    budget before any JSON is emitted."""


@dataclass
@@ -380,10 +380,13 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
            "max_tokens": max_tok,
            "temperature": temp,
        }
+        extra_body: dict[str, Any] = {}
        if send_mm_kwargs and mm_kwargs:
-            kwargs["extra_body"] = {
-                "mm_processor_kwargs": {**mm_kwargs, "do_sample_frames": True}
-            }
+            extra_body["mm_processor_kwargs"] = {**mm_kwargs, "do_sample_frames": True}
+        if config.chat_template_kwargs:
+            extra_body["chat_template_kwargs"] = config.chat_template_kwargs
+        if extra_body:
+            kwargs["extra_body"] = extra_body
        with rr_lock:
            chosen = clients[rr_counter["i"] % len(clients)]
            rr_counter["i"] += 1