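Small fix: reference subtask_annotation.py in usage examples and load the model with Qwen3VLMoeForConditionalGeneration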

This commit is contained in:
Pepijn
2025-11-18 13:47:05 +01:00
parent cf0f878dbb
commit 3c9149e909
@@ -43,7 +43,7 @@ Usage:
pip install transformers torch qwen-vl-utils accelerate
# Annotate and push to hub:
python subtask_annotation_local.py \\
python subtask_annotation.py \\
--repo-id pepijn223/mydataset \\
--subtasks "reach,grasp,lift,place" \\
--video-key observation.images.base \\
@@ -65,7 +65,7 @@ from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.tree import Tree
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor
from lerobot.datasets.lerobot_dataset import LeRobotDataset
@@ -158,7 +158,7 @@ class VideoAnnotator:
self.console.print(f"[cyan]Loading model: {model_name}...[/cyan]")
# Load model and processor
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
self.model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
model_name,
torch_dtype=torch_dtype,
device_map=device,
@@ -743,15 +743,15 @@ def main():
epilog="""
Examples:
# List available cameras:
python subtask_annotation_local.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --max-episodes 0
python subtask_annotation.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --max-episodes 0
# Annotate with specific camera:
python subtask_annotation_local.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --push-to-hub
python subtask_annotation.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --push-to-hub
# Use smaller model (7B instead of 30B):
python subtask_annotation_local.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --model Qwen/Qwen2-VL-7B-Instruct --push-to-hub
# Use custom model:
python subtask_annotation.py --repo-id pepijn223/mydataset --subtasks "reach,grasp" --video-key observation.images.top --model Qwen/Qwen3-VL-30B-A3B-Instruct --push-to-hub
Note: The 7B model requires ~16GB VRAM. Use 2B model (~8GB VRAM) if needed.
Note: The 30B model requires ~60GB VRAM. Make sure you have sufficient GPU memory.
"""
)
parser.add_argument(
@@ -783,7 +783,7 @@ Note: The 7B model requires ~16GB VRAM. Use 2B model (~8GB VRAM) if needed.
"--model",
type=str,
default="Qwen/Qwen3-VL-30B-A3B-Instruct",
help="Model to use (default: Qwen/Qwen3-VL-30B-A3B-Instruct). Other options: Qwen/Qwen2-VL-2B-Instruct, Qwen/Qwen2-VL-7B-Instruct",
help="Qwen3-VL model to use (default: Qwen/Qwen3-VL-30B-A3B-Instruct)",
)
parser.add_argument(
"--skip-existing",