add qwen 3.5 and fix video extraction

This commit is contained in: root
Date: 2026-03-04 12:22:41 +00:00
parent 51b3b31927
commit 3d55c5e484
3 changed files with 213 additions and 28 deletions
@@ -6,7 +6,7 @@
# Configuration
REPO_ID="jadechoghari/piper-demo-20260205_103303"
# MODEL="Qwen/Qwen3-VL-30B-A3B-Thinking"
MODEL="Qwen/Qwen2-VL-7B-Instruct"
MODEL="Qwen/Qwen3.5-27B"
# or: MODEL="Qwen/Qwen2-VL-7B-Instruct"
@@ -21,12 +21,13 @@ SAMPLE_INTERVAL=5.0 # generate dialogue every 1 second (all episodes processed)
# Example (add backslash after "$MODEL" and uncomment the next line):
# --model "$MODEL" \
# --subtask-labels "pick_up_yellow_nut_bar" "pick_up_cake" "pick_up_biscuit_pack" "pick_up_soda_can"
python /admin/home/jade_choghari/lerobot/src/lerobot/data_processing/annotations/subtask_annotate.py \
python /home/lerobot/src/lerobot/data_processing/annotations/subtask_annotate.py \
--repo-id "$REPO_ID" \
--video-key observation.images.top \
--output-dir "$OUTPUT_DIR" \
--output-repo-id "jadechoghari/piper-demo-annotated1" \
--push-to-hub \
--no-timer-overlay \
--model "$MODEL" \
--subtask-labels "pick_up_yellow_nut_bar" "pick_up_cake" "pick_up_biscuit_pack" "pick_up_soda_can" \
--batch-size 2
@@ -34,6 +34,7 @@ separate subtask hierarchy while preserving the original task annotations.
Supported VLMs (modular design allows easy extension):
- Qwen2-VL (default): "Qwen/Qwen2-VL-7B-Instruct"
- Qwen3-VL: "Qwen/Qwen3-VL-30B-A3B-Instruct"
- Qwen3.5-VL: "Qwen/Qwen3.5-27B", "Qwen/Qwen3-VL-8B-Instruct" (Qwen3_5ForConditionalGeneration)
Usage:
```bash
@@ -581,6 +582,169 @@ class Qwen3VL(BaseVLM):
raise ValueError(f"Could not parse skills from response: {response[:200]}...")
# Qwen3.5-VL Implementation (Qwen3_5ForConditionalGeneration)
class Qwen3_5VL(BaseVLM):
"""Qwen3.5-VL model for skill segmentation (Qwen3_5ForConditionalGeneration)."""
def __init__(self, model_name: str, device: str = "cuda", torch_dtype: torch.dtype = torch.bfloat16):
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
self.console = Console()
self.device = device
self.model_name = model_name
self.process_vision_info = process_vision_info
self.console.print(f"[cyan]Loading Qwen3.5-VL model: {model_name}...[/cyan]")
self.model = Qwen3_5ForConditionalGeneration.from_pretrained(
model_name, torch_dtype=torch_dtype, device_map=device, trust_remote_code=True
)
self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
self.processor.tokenizer.padding_side = "left"
self.console.print(f"[green]✓ Model loaded successfully on {device}[/green]")
def segment_skills(
self,
video_path: Path,
episode_duration: float,
coarse_goal: str | None = None,
subtask_labels: list[str] | None = None,
) -> list[Skill]:
"""Segment video into skills using Qwen3.5-VL."""
prompt = create_skill_segmentation_prompt(
coarse_goal, subtask_labels, duration_seconds=episode_duration
)
duration_str = f"{int(episode_duration // 60):02d}:{int(episode_duration % 60):02d}"
messages = [
{"role": "system", "content": [{"type": "text", "text": prompt}]},
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {episode_duration:.1f} seconds). Segment into atomic skills. Last skill must end at {episode_duration:.1f}.",
},
],
},
]
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = self.process_vision_info(messages)
inputs = self.processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
).to(self.device)
with torch.no_grad():
generated_ids = self.model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
response = self.processor.batch_decode(
[out[len(inp) :] for inp, out in zip(inputs.input_ids, generated_ids)],
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0].strip()
return self._parse_skills_response(response)
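For reference, a minimal usage sketch of the single-video path above. The checkpoint, clip path, goal, and labels are placeholders, and loading the 27B checkpoint assumes a GPU with enough memory:

```python
# Hypothetical usage of Qwen3_5VL.segment_skills; all values below are illustrative.
from pathlib import Path

vlm = Qwen3_5VL("Qwen/Qwen3.5-27B", device="cuda")   # downloads and loads the full checkpoint
skills = vlm.segment_skills(
    video_path=Path("/tmp/episode_000000.mp4"),      # a clip extracted beforehand
    episode_duration=42.5,                            # seconds; must match the clip length
    coarse_goal="pick up snacks and place them in the bin",
    subtask_labels=["pick_up_cake", "pick_up_biscuit_pack"],
)
for skill in skills:                                  # each entry is a Skill parsed from the JSON reply
    print(skill)
```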
def segment_skills_batch(
self,
video_paths: list[Path],
episode_durations: list[float],
coarse_goal: str | None = None,
subtask_labels: list[str] | None = None,
) -> list[list[Skill]]:
"""Segment multiple videos into skills using Qwen3.5-VL in a batch."""
all_messages = []
for video_path, duration in zip(video_paths, episode_durations):
prompt = create_skill_segmentation_prompt(
coarse_goal, subtask_labels, duration_seconds=duration
)
duration_str = f"{int(duration // 60):02d}:{int(duration % 60):02d}"
messages = [
{"role": "system", "content": [{"type": "text", "text": prompt}]},
{
"role": "user",
"content": [
{"type": "video", "video": str(video_path), "fps": 1.0},
{
"type": "text",
"text": f"Video duration: {duration_str} (exactly {duration:.1f} seconds). Segment into atomic skills. Last skill must end at {duration:.1f}.",
},
],
},
]
all_messages.append(messages)
all_texts = []
all_image_inputs = []
all_video_inputs = []
for messages in all_messages:
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
image_inputs, video_inputs = self.process_vision_info(messages)
all_texts.append(text)
all_image_inputs.extend(image_inputs or [])
all_video_inputs.extend(video_inputs or [])
inputs = self.processor(
text=all_texts,
images=all_image_inputs if all_image_inputs else None,
videos=all_video_inputs if all_video_inputs else None,
padding=True,
return_tensors="pt",
).to(self.device)
with torch.no_grad():
generated_ids = self.model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
responses = self.processor.batch_decode(
[out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)],
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
all_skills = []
for idx, response in enumerate(responses):
try:
skills = self._parse_skills_response(response.strip())
if not skills:
self.console.print(f"[yellow]Warning: No skills parsed from response for video {idx}[/yellow]")
all_skills.append(skills)
except Exception as e:
self.console.print(f"[yellow]Warning: Failed to parse response for video {idx}: {e}[/yellow]")
all_skills.append([])
return all_skills
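A matching sketch for the batch path; the clip paths and durations are placeholders and `vlm` is a Qwen3_5VL instance as above:

```python
# Hypothetical batch call; clip paths and durations are illustrative only.
from pathlib import Path

clips = [Path("/tmp/episode_000000.mp4"), Path("/tmp/episode_000001.mp4")]
durations = [42.5, 37.0]                              # seconds, one per clip
per_episode_skills = vlm.segment_skills_batch(
    video_paths=clips,
    episode_durations=durations,
    subtask_labels=["pick_up_cake", "pick_up_soda_can"],
)
for idx, skills in enumerate(per_episode_skills):
    print(f"episode {idx}: {len(skills)} skills")
```

The left padding configured in `__init__` (`padding_side = "left"`) matters here: with a decoder-only model, left padding keeps every prompt flush against its generated continuation, so slicing each output by `len(inp)` in `batch_decode` returns only the newly generated tokens.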
def _parse_skills_response(self, response: str) -> list[Skill]:
"""Parse the VLM response into Skill objects."""
if "```json" in response:
response = response.split("```json")[1].split("```")[0]
elif "```" in response:
response = response.split("```")[1].split("```")[0]
try:
data = json.loads(response)
skills_data = data.get("skills", data)
if isinstance(skills_data, list):
return [Skill.from_dict(s) for s in skills_data]
except json.JSONDecodeError:
match = re.search(r"\{.*\}", response, re.DOTALL)
if match:
data = json.loads(match.group())
skills_data = data.get("skills", [])
return [Skill.from_dict(s) for s in skills_data]
raise ValueError(f"Could not parse skills from response: {response[:200]}...")
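To illustrate the parsing helper, a small sketch with a fabricated model reply. The per-skill keys ("name", "start", "end") are an assumption, since they are not shown in this hunk, and must match whatever `Skill.from_dict` actually accepts; `vlm` stands for any constructed instance:

```python
# Fabricated model output exercising the plain-JSON path (no backtick fence to strip).
# The per-skill keys are assumed and may differ from the real Skill schema.
fake_response = (
    '{"skills": ['
    '{"name": "pick_up_cake", "start": 0.0, "end": 18.4}, '
    '{"name": "pick_up_soda_can", "start": 18.4, "end": 42.5}'
    ']}'
)
skills = vlm._parse_skills_response(fake_response)
print(len(skills))   # 2 if parsing succeeds
```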
# VLM Registry - Add new VLMs here
VLM_REGISTRY: dict[str, type[BaseVLM]] = {
@@ -590,6 +754,9 @@ VLM_REGISTRY: dict[str, type[BaseVLM]] = {
"Qwen/Qwen2-VL-72B-Instruct": Qwen2VL,
# Qwen3-VL variants (MoE)
"Qwen/Qwen3-VL-30B-A3B-Instruct": Qwen3VL,
# Qwen3.5-VL (Qwen3_5ForConditionalGeneration)
"Qwen/Qwen3.5-27B": Qwen3_5VL,
"Qwen/Qwen3-VL-8B-Instruct": Qwen3_5VL,
}
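Since the registry maps exact checkpoint names to wrapper classes, wiring in another Qwen3.5 checkpoint is a single entry; a sketch with a hypothetical name:

```python
# Hypothetical extra entry; "Qwen/Qwen3.5-9B" is illustrative, not a known release.
VLM_REGISTRY["Qwen/Qwen3.5-9B"] = Qwen3_5VL
```

Names not listed here still resolve through the substring fallback in `get_vlm`, shown in the next hunk.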
@@ -614,6 +781,8 @@ def get_vlm(model_name: str, device: str = "cuda", torch_dtype: torch.dtype = to
# Check for partial matches (e.g., "qwen2" in model name)
model_lower = model_name.lower()
if "qwen3.5" in model_lower:
return Qwen3_5VL(model_name, device, torch_dtype)
if "qwen3" in model_lower:
return Qwen3VL(model_name, device, torch_dtype)
elif "qwen2" in model_lower or "qwen-vl" in model_lower:
@@ -702,43 +871,65 @@ class VideoExtractor:
"""
Add a visible timer overlay to each frame (elapsed time in seconds) in one corner.
Used so the VLM can read the timestamp from the image instead of relying on file metadata.
Writes to a new temporary file and returns its path.
Draws a black box with white text at top-right. Writes to a new temporary file and returns its path.
"""
out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
out_path = Path(out_file.name)
out_file.close()
cap = cv2.VideoCapture(str(video_path))
if not cap.isOpened():
raise RuntimeError("Failed to open video")
fps = cap.get(cv2.CAP_PROP_FPS) or 1.0
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(str(out_path), fourcc, fps, (w, h))
# Font scale so timer is big and readable (proportional to frame size)
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = max(1.2, min(h, w) / 350.0)
thickness = max(2, int(font_scale))
font = cv2.FONT_HERSHEY_SIMPLEX
padding = 15
margin = 30
frame_idx = 0
while True:
ret, frame = cap.read()
if not ret:
break
t_sec = frame_idx / fps
text = f"{t_sec:.1f} s"
# Position: top-right corner with margin
(tw, th), _ = cv2.getTextSize(text, font, font_scale, thickness)
x = w - tw - 30
y = 25 + th
# Black outline for readability on any background (draw in 8 directions then center)
for dx in (-2, -1, 0, 1, 2):
for dy in (-2, -1, 0, 1, 2):
if dx != 0 or dy != 0:
cv2.putText(
frame, text, (x + dx, y + dy), font, font_scale, (0, 0, 0), thickness
)
cv2.putText(frame, text, (x, y), font, font_scale, (255, 255, 255), thickness)
text = f"{t_sec:.2f} s"
(tw, th), baseline = cv2.getTextSize(text, font, font_scale, thickness)
# Top-right placement
x_text = w - tw - margin - padding
y_text = margin + th + padding
# Rectangle coordinates (black box behind text)
x1 = x_text - padding
y1 = y_text - th - padding
x2 = x_text + tw + padding
y2 = y_text + baseline + padding
# Draw black filled rectangle
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), -1)
# Draw white text
cv2.putText(
frame,
text,
(x_text, y_text),
font,
font_scale,
(255, 255, 255),
thickness,
lineType=cv2.LINE_AA,
)
writer.write(frame)
frame_idx += 1
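The box-behind-text placement above follows a standard OpenCV pattern; below is a self-contained sketch on a synthetic frame for checking the geometry without decoding a video (the frame size and output path are arbitrary):

```python
import cv2
import numpy as np

# Synthetic 640x480 frame standing in for a decoded video frame.
h, w = 480, 640
frame = np.zeros((h, w, 3), dtype=np.uint8)
text = "12.34 s"
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = max(1.2, min(h, w) / 350.0)
thickness = max(2, int(font_scale))
padding, margin = 15, 30

(tw, th), baseline = cv2.getTextSize(text, font, font_scale, thickness)
x_text = w - tw - margin - padding
y_text = margin + th + padding
# Filled black rectangle behind the text, then anti-aliased white text on top.
cv2.rectangle(
    frame,
    (x_text - padding, y_text - th - padding),
    (x_text + tw + padding, y_text + baseline + padding),
    (0, 0, 0),
    -1,
)
cv2.putText(frame, text, (x_text, y_text), font, font_scale, (255, 255, 255), thickness, lineType=cv2.LINE_AA)
cv2.imwrite("/tmp/overlay_preview.png", frame)   # inspect the placement by eye
```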
@@ -968,8 +1159,9 @@ class SkillAnnotator:
# Extract episode segment to temporary file
extracted_path = self.video_extractor.extract_episode_video(
video_path, start_ts, end_ts, target_fps=1
video_path, start_ts, end_ts, target_fps=dataset.meta.fps
)
if self.add_timer_overlay:
video_for_vlm = self.video_extractor.add_timer_overlay(extracted_path)
extracted_paths.append(extracted_path)
+1 -9
@@ -167,18 +167,10 @@ The total video length is **{video_duration_seconds} seconds** ({video_duration_
- The first subtask always starts at 0.0.
- The last subtask must end at exactly {video_duration_seconds} (the full video length).
- **Time is displayed inside the video**: a visible timer in one corner shows the elapsed time in seconds (from 0.0 to the end). Use this on-screen timer to set accurate start and end times for each skill.
# Step 1 — Textual Timeline (must do this first)
First, write an extensive and detailed textual timeline describing what happens in the video with approximate timestamps. **Read the time from the visible timer shown in the video** to get accurate timestamps.
For each subtask, include:
- its name
- an approximate start and end time (from the on-screen timer),
- a description of the visual event at the boundary (e.g. "shirt fully folded to the left", "robot rotates folded shirt 90 degrees").
Format this as a bullet list.
# Output Format
After your analysis, output ONLY valid JSON with this exact structure. The last skill's "end" MUST be exactly {video_duration_seconds}. Use the timestamps you read from the visible timer in the video:
Output ONLY valid JSON with this exact structure. The last skill's "end" MUST be exactly {video_duration_seconds}. Use the timestamps you read from the visible timer in the video:
```json
{{