Load Qwen3-VL MoE model class and fix script name in usage/help text

2026-05-15 08:39:49 +00:00 · 2025-11-18 13:47:05 +01:00
parent cf0f878dbb
commit 3c9149e909
1 changed file with 9 additions and 9 deletions
@@ -43,7 +43,7 @@ Usage:
 pip install transformers torch qwen-vl-utils accelerate

 # Annotate and push to hub:
-python subtask_annotation_local.py \\
+python subtask_annotation.py \\
  --repo-id pepijn223/mydataset \\
  --subtasks "reach,grasp,lift,place" \\
  --video-key observation.images.base \\
@@ -65,7 +65,7 @@ from rich.console import Console
 from rich.panel import Panel
 from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich.tree import Tree
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor

 from lerobot.datasets.lerobot_dataset import LeRobotDataset

@@ -158,7 +158,7 @@ class VideoAnnotator:
        self.console.print(f"[cyan]Loading model: {model_name}...[/cyan]")
        
        # Load model and processor
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+        self.model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            device_map=device,
@@ -743,15 +743,15 @@ def main():
        epilog="""
 Examples:
  # List available cameras:
-  python subtask_annotation_local.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --max-episodes 0
+  python subtask_annotation.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --max-episodes 0
  
  # Annotate with specific camera:
-  python subtask_annotation_local.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --push-to-hub
+  python subtask_annotation.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --push-to-hub
  
-  # Use smaller model (7B instead of 30B):
-  python subtask_annotation_local.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --model Qwen/Qwen2-VL-7B-Instruct --push-to-hub
+  # Use custom model:
+  python subtask_annotation.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --model Qwen/Qwen3-VL-30B-A3B-Instruct --push-to-hub

-Note: The 7B model requires ~16GB VRAM. Use 2B model (~8GB VRAM) if needed.
+Note: The 30B model requires ~60GB VRAM. Make sure you have sufficient GPU memory.
 """
    )
    parser.add_argument(
@@ -783,7 +783,7 @@ Note: The 7B model requires ~16GB VRAM. Use 2B model (~8GB VRAM) if needed.
        "--model",
        type=str,
        default="Qwen/Qwen3-VL-30B-A3B-Instruct",
-        help="Model to use (default: Qwen/Qwen3-VL-30B-A3B-Instruct). Other options: Qwen/Qwen2-VL-2B-Instruct, Qwen/Qwen2-VL-7B-Instruct",
+        help="Qwen3-VL model to use (default: Qwen/Qwen3-VL-30B-A3B-Instruct)",
    )
    parser.add_argument(
        "--skip-existing",