From 4503019d18b1a80dbb7980ef12d30647d519c6db Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Mon, 9 Feb 2026 10:55:22 +0100 Subject: [PATCH] clean subtask --- .../annotations/subtask_annotate.py | 142 +++++++++++------- src/lerobot/utils/constants.py | 34 +++++ 2 files changed, 125 insertions(+), 51 deletions(-) diff --git a/src/lerobot/data_processing/annotations/subtask_annotate.py b/src/lerobot/data_processing/annotations/subtask_annotate.py index 73d763e9c..f296cfea1 100644 --- a/src/lerobot/data_processing/annotations/subtask_annotate.py +++ b/src/lerobot/data_processing/annotations/subtask_annotate.py @@ -81,6 +81,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn from lerobot.datasets.dataset_tools import add_features from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.utils.constants import SKILL_SEGMENTATION_PROMPT_TEMPLATE # Skill Annotation Data Structures @@ -141,7 +142,11 @@ class BaseVLM(ABC): @abstractmethod def segment_skills( - self, video_path: Path, episode_duration: float, coarse_goal: str | None = None + self, + video_path: Path, + episode_duration: float, + coarse_goal: str | None = None, + subtask_labels: list[str] | None = None, ) -> list[Skill]: """ Segment a video into atomic skills. @@ -150,6 +155,7 @@ class BaseVLM(ABC): video_path: Path to the video file episode_duration: Total duration of the episode in seconds coarse_goal: Optional high-level task description + subtask_labels: Optional list of allowed skill labels to use Returns: List of Skill objects representing atomic manipulation skills @@ -158,7 +164,11 @@ class BaseVLM(ABC): @abstractmethod def segment_skills_batch( - self, video_paths: list[Path], episode_durations: list[float], coarse_goal: str | None = None + self, + video_paths: list[Path], + episode_durations: list[float], + coarse_goal: str | None = None, + subtask_labels: list[str] | None = None, ) -> list[list[Skill]]: """ Segment multiple videos into atomic skills in a single batch. @@ -167,6 +177,7 @@ class BaseVLM(ABC): video_paths: List of paths to video files episode_durations: List of episode durations in seconds coarse_goal: Optional high-level task description + subtask_labels: Optional list of allowed skill labels to use Returns: List of skill lists, one for each video @@ -174,43 +185,36 @@ class BaseVLM(ABC): pass -def create_skill_segmentation_prompt(coarse_goal: str | None = None) -> str: - """Create the prompt for skill segmentation.""" +def create_skill_segmentation_prompt( + coarse_goal: str | None = None, + subtask_labels: list[str] | None = None, +) -> str: + """Create the prompt for skill segmentation. + + Args: + coarse_goal: Optional high-level task description. + subtask_labels: Optional list of allowed skill/subtask labels. When provided, + the model is instructed to use only these labels (choosing the best match + for each segment). + + Returns: + The formatted prompt string. + """ goal_context = f'The overall goal is: "{coarse_goal}"\n\n' if coarse_goal else "" - - return textwrap.dedent(f"""\ - # Role - You are a Robotics Vision System specializing in temporal action segmentation for robot manipulation demonstrations. - - # Task - {goal_context}Segment this robot demonstration video into short atomic manipulation skills. Each skill should: - - Last approximately 1-3 seconds - - Describe a clear, single action (e.g., "pick up object", "move arm left", "release gripper") - - Have precise start and end timestamps - - # Requirements - 1. **Atomic Actions**: Each skill should be a single, indivisible action - 2. **Complete Coverage**: Skills must cover the entire video duration with no gaps - 3. **Boundary Consistency**: The end of one skill equals the start of the next - 4. **Natural Language**: Use clear, descriptive names for each skill - 5. **Timestamps**: Use seconds (float) for all timestamps - - - - # Output Format - After your analysis, output ONLY valid JSON with this exact structure: - - ```json - {{ - "skills": [ - {{"name": "skill description", "start": 0.0, "end": 1.5}}, - {{"name": "another skill", "start": 1.5, "end": 3.2}} - ] - }} - ``` - - The first skill must start at 0.0 and the last skill must end at the video duration. - """) + if subtask_labels: + labels_str = ", ".join(f'"{label}"' for label in subtask_labels) + subtask_labels_section = ( + f'6. **Allowed labels**: Use ONLY the following skill names ' + f"(choose the best match for each segment): {labels_str}\n\n" + ) + else: + subtask_labels_section = "" + return textwrap.dedent( + SKILL_SEGMENTATION_PROMPT_TEMPLATE.format( + goal_context=goal_context, + subtask_labels_section=subtask_labels_section, + ) + ) # Qwen2-VL Implementation @@ -238,10 +242,14 @@ class Qwen2VL(BaseVLM): self.console.print(f"[green]✓ Model loaded successfully on {device}[/green]") def segment_skills( - self, video_path: Path, episode_duration: float, coarse_goal: str | None = None + self, + video_path: Path, + episode_duration: float, + coarse_goal: str | None = None, + subtask_labels: list[str] | None = None, ) -> list[Skill]: """Segment video into skills using Qwen2-VL.""" - prompt = create_skill_segmentation_prompt(coarse_goal) + prompt = create_skill_segmentation_prompt(coarse_goal, subtask_labels) duration_str = f"{int(episode_duration // 60):02d}:{int(episode_duration % 60):02d}" messages = [ @@ -279,10 +287,14 @@ class Qwen2VL(BaseVLM): return self._parse_skills_response(response) def segment_skills_batch( - self, video_paths: list[Path], episode_durations: list[float], coarse_goal: str | None = None + self, + video_paths: list[Path], + episode_durations: list[float], + coarse_goal: str | None = None, + subtask_labels: list[str] | None = None, ) -> list[list[Skill]]: """Segment multiple videos into skills using Qwen2-VL in a batch.""" - prompt = create_skill_segmentation_prompt(coarse_goal) + prompt = create_skill_segmentation_prompt(coarse_goal, subtask_labels) # Create messages for each video all_messages = [] @@ -394,10 +406,14 @@ class Qwen3VL(BaseVLM): self.console.print(f"[green]✓ Model loaded successfully on {device}[/green]") def segment_skills( - self, video_path: Path, episode_duration: float, coarse_goal: str | None = None + self, + video_path: Path, + episode_duration: float, + coarse_goal: str | None = None, + subtask_labels: list[str] | None = None, ) -> list[Skill]: """Segment video into skills using Qwen3-VL.""" - prompt = create_skill_segmentation_prompt(coarse_goal) + prompt = create_skill_segmentation_prompt(coarse_goal, subtask_labels) duration_str = f"{int(episode_duration // 60):02d}:{int(episode_duration % 60):02d}" messages = [ @@ -435,10 +451,14 @@ class Qwen3VL(BaseVLM): return self._parse_skills_response(response) def segment_skills_batch( - self, video_paths: list[Path], episode_durations: list[float], coarse_goal: str | None = None + self, + video_paths: list[Path], + episode_durations: list[float], + coarse_goal: str | None = None, + subtask_labels: list[str] | None = None, ) -> list[list[Skill]]: """Segment multiple videos into skills using Qwen3-VL in a batch.""" - prompt = create_skill_segmentation_prompt(coarse_goal) + prompt = create_skill_segmentation_prompt(coarse_goal, subtask_labels) # Create messages for each video all_messages = [] @@ -679,6 +699,7 @@ class SkillAnnotator: video_key: str, episodes: list[int] | None = None, skip_existing: bool = False, + subtask_labels: list[str] | None = None, ) -> dict[int, EpisodeSkills]: """ Annotate all episodes in a dataset with skill labels using batched processing. @@ -688,6 +709,7 @@ class SkillAnnotator: video_key: Key for video observations (e.g., "observation.images.base") episodes: Specific episode indices to annotate (None = all) skip_existing: Skip episodes that already have skill annotations + subtask_labels: Optional list of allowed skill labels (VLM will use only these) Returns: Dictionary mapping episode index to EpisodeSkills @@ -732,7 +754,7 @@ class SkillAnnotator: try: batch_annotations = self._annotate_episodes_batch( - dataset, batch_episodes, video_key, coarse_goal + dataset, batch_episodes, video_key, coarse_goal, subtask_labels ) for ep_idx in batch_episodes: @@ -754,7 +776,9 @@ class SkillAnnotator: # Fallback: process episodes one by one for ep_idx in batch_episodes: try: - skills = self._annotate_episode(dataset, ep_idx, video_key, coarse_goal) + skills = self._annotate_episode( + dataset, ep_idx, video_key, coarse_goal, subtask_labels + ) if skills: annotations[ep_idx] = EpisodeSkills( episode_index=ep_idx, @@ -778,7 +802,9 @@ class SkillAnnotator: for ep_idx, error_msg in list(failed_episodes.items()): self.console.print(f"[cyan]Retry attempt for episode {ep_idx} (previous error: {error_msg})[/cyan]") try: - skills = self._annotate_episode(dataset, ep_idx, video_key, coarse_goal) + skills = self._annotate_episode( + dataset, ep_idx, video_key, coarse_goal, subtask_labels + ) if skills: annotations[ep_idx] = EpisodeSkills( episode_index=ep_idx, @@ -823,6 +849,7 @@ class SkillAnnotator: episode_indices: list[int], video_key: str, coarse_goal: str, + subtask_labels: list[str] | None = None, ) -> dict[int, list[Skill]]: """Annotate multiple episodes with skill labels in a batch.""" # Extract all videos for this batch @@ -863,7 +890,9 @@ class SkillAnnotator: try: # Run VLM skill segmentation in batch - all_skills = self.vlm.segment_skills_batch(extracted_paths, durations, coarse_goal) + all_skills = self.vlm.segment_skills_batch( + extracted_paths, durations, coarse_goal, subtask_labels + ) # Map results back to episode indices results = {} @@ -884,6 +913,7 @@ class SkillAnnotator: episode_index: int, video_key: str, coarse_goal: str, + subtask_labels: list[str] | None = None, ) -> list[Skill]: """Annotate a single episode with skill labels.""" # Get video path and timestamps for this episode @@ -905,7 +935,9 @@ class SkillAnnotator: try: # Run VLM skill segmentation - skills = self.vlm.segment_skills(extracted_path, duration, coarse_goal) + skills = self.vlm.segment_skills( + extracted_path, duration, coarse_goal, subtask_labels + ) return skills finally: # Clean up temporary file @@ -1269,6 +1301,13 @@ def main(): action="store_true", help="Skip episodes that already have annotations", ) + parser.add_argument( + "--subtask-labels", + type=str, + nargs="+", + default=None, + help="Optional list of allowed skill labels (VLM will use only these; space-separated)", + ) # Output options parser.add_argument( @@ -1325,6 +1364,7 @@ def main(): video_key=args.video_key, episodes=args.episodes, skip_existing=args.skip_existing, + subtask_labels=args.subtask_labels, ) # Save annotations diff --git a/src/lerobot/utils/constants.py b/src/lerobot/utils/constants.py index 577a18a9e..edc1762f5 100644 --- a/src/lerobot/utils/constants.py +++ b/src/lerobot/utils/constants.py @@ -92,3 +92,37 @@ LIBERO_KEY_JOINTS_POS = "robot_state/joints/pos" LIBERO_KEY_JOINTS_VEL = "robot_state/joints/vel" LIBERO_KEY_PIXELS_AGENTVIEW = "pixels/agentview_image" LIBERO_KEY_PIXELS_EYE_IN_HAND = "pixels/robot0_eye_in_hand_image" + +# Skill segmentation prompt template for VLM-based subtask annotation +# Placeholders: {goal_context}, {subtask_labels_section} +SKILL_SEGMENTATION_PROMPT_TEMPLATE = """# Role +You are a Robotics Vision System specializing in temporal action segmentation for robot manipulation demonstrations. + +# Task +{goal_context}Segment this robot demonstration video into short atomic manipulation skills. Each skill should: +- Last approximately 1-3 seconds +- Describe a clear, single action (e.g., "pick up object", "move arm left", "release gripper") +- Have precise start and end timestamps + +# Requirements +1. **Atomic Actions**: Each skill should be a single, indivisible action +2. **Complete Coverage**: Skills must cover the entire video duration with no gaps +3. **Boundary Consistency**: The end of one skill equals the start of the next +4. **Natural Language**: Use clear, descriptive names for each skill +5. **Timestamps**: Use seconds (float) for all timestamps +{subtask_labels_section} + +# Output Format +After your analysis, output ONLY valid JSON with this exact structure: + +```json +{{ + "skills": [ + {{"name": "skill description", "start": 0.0, "end": 1.5}}, + {{"name": "another skill", "start": 1.5, "end": 3.2}} + ] +}} +``` + +The first skill must start at 0.0 and the last skill must end at the video duration. +"""