Files
lerobot/src/lerobot/annotations/steerable_pipeline/config.py
T
Pepijn 663fff0ae2 feat(annotate): Module 1 sees the whole episode as one video block
Replaces keyframe sampling with a single Qwen-VL video block covering
the whole demonstration. The model pools temporally itself and chooses
where to cut subtasks — no stride, no count, no keyframe count knob to
tune.

- frames.py: ``FrameProvider`` gains ``video_for_episode(record,
  max_frames)``; ``VideoFrameProvider`` samples up to ``max_frames``
  uniformly across the episode duration; ``_NullProvider`` returns []
  for the no-video fallback. New ``to_video_block`` helper.
- Module 1: drops keyframe sampling. The subtask prompt now goes out as
  ``[{"type":"video", "video":[<frames>]}, {"type":"text", ...}]`` and
  the prompt template asks the model to "watch the whole clip, then
  segment it" with cut points decided from gripper/contact/regrasp
  events the model sees.
- Module1Config: ``keyframes_per_episode`` removed; replaced with
  ``max_video_frames: int = 32`` (model-capacity bound, not annotation
  logic).
- Test: ``test_module1_attaches_video_block_to_subtask_prompt`` locks in
  the single-video-block invariant.
- Stub-VLM markers updated: tests now key on "atomic subtasks" instead
  of the old "Decompose the demonstration" phrase that no longer
  appears in the prompt.
- Docs: updated to describe the whole-episode video-block behavior and
  the no-video fallback.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 18:48:33 +02:00

120 lines
3.7 KiB
Python

#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal
@dataclass
class Module1Config:
"""Module 1 hyperparameters: plan + subtasks + memory.
Subtask decomposition sees the **whole episode** as one Qwen-VL video
block — no keyframe stride or count: the model handles temporal pooling
itself and decides where to cut. ``max_video_frames`` only caps the
number of frames packed into the video block (a model-capacity bound,
not an annotation-logic knob).
"""
enabled: bool = True
max_video_frames: int = 32
min_subtask_seconds: float = 1.5
plan_max_steps: int = 8
@dataclass
class Module2Config:
"""Module 2 hyperparameters: interjections + paired speech."""
enabled: bool = True
max_interjections_per_episode: int = 1
interjection_min_t: float = 2.0
@dataclass
class Module3Config:
"""Module 3 hyperparameters: general VQA."""
enabled: bool = True
vqa_emission_hz: float = 1.0
K: int = 3
question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
@dataclass
class VlmConfig:
"""Shared Qwen-VL client configuration."""
backend: Literal["vllm", "transformers", "stub"] = "vllm"
model_id: str = "Qwen/Qwen3.6-27B-FP8"
max_new_tokens: int = 512
temperature: float = 0.2
json_mode: bool = True
batch_size: int = 4
tensor_parallel_size: int = 1
camera_key: str | None = None
"""Override the camera stream used for keyframe attachment. ``None`` picks
the first ``observation.images.*`` key the dataset declares."""
@dataclass
class ExecutorConfig:
"""Executor selection and SLURM hyperparameters."""
auto_threshold: int = 32
force_local: bool = False
slurm_partition: str | None = None
slurm_gpus: int = 1
slurm_time: str = "06:00:00"
workers: int = 1
@dataclass
class AnnotationPipelineConfig:
"""Top-level config for ``lerobot-annotate``.
Mirrors the structure of :class:`lerobot.configs.train.TrainPipelineConfig`:
a draccus-parsed dataclass that contains nested per-module sub-configs and
leaves the dataset, executor, and VLM choices independently knobbable.
Output is always in-place: the writer rewrites ``data/chunk-*/file-*.parquet``
in place. Multiple revisions of the same dataset live in separate copies.
"""
repo_id: str | None = None
root: Path | None = None
staging_dir: Path | None = None
"""If unset, defaults to ``<root>/.annotate_staging/``."""
seed: int = 1729
module_1: Module1Config = field(default_factory=Module1Config)
module_2: Module2Config = field(default_factory=Module2Config)
module_3: Module3Config = field(default_factory=Module3Config)
vlm: VlmConfig = field(default_factory=VlmConfig)
executor: ExecutorConfig = field(default_factory=ExecutorConfig)
skip_validation: bool = False
only_episodes: tuple[int, ...] | None = None
def resolved_staging_dir(self, root: Path) -> Path:
return self.staging_dir if self.staging_dir is not None else root / ".annotate_staging"