Add voice example

2026-07-23 17:56:07 +00:00 · 2025-12-17 16:31:25 +01:00
parent 8e05dc9a7a
commit b229e7df28
2 changed files with 383 additions and 0 deletions
@@ -0,0 +1,47 @@
 # Voice Assistant Examples
 Voice-enabled robot assistant examples using speech-to-text (STT), and text-to-speech (TTS).
 ## Overview
 These examples demonstrate how to build a voice interface for robot control:
 1. **Hold SPACE** → Push-to-talk recording starts
 2. **Release SPACE** → Recording stops
 3. **STT (Whisper)** → Converts speech to text (high-level task prompt)
 4. **Pi0.5** → Generates robot response/utterance
 5. **TTS (Kokoro)** → Speaks the response back
 ## Requirements
 ```bash
 pip install torch transformers sounddevice numpy pynput kokoro>=0.9.2
 ```
 ## Usage
 ### With Pi0.5 Model
 ```bash
 python examples/voice_assistant/voice_assistant_pi05.py \
    --pretrained_path path/to/pi05/checkpoint
 ```
 ## How It Works
 ### Pi0.5 Voice Integration
 Pi0.5 can generate robot utterances as part of its subtask prediction. The flow:
 1. **High-level prompt**: User voice command is transcribed and formatted as a task prompt
 2. **Subtask generation**: Pi0.5 autoregressively generates a response
 3. **Utterance extraction**: If the response contains `<utterance>...</utterance>` tags, the content is extracted
 4. **TTS output**: The response is spoken back to the user
 ## Configuration Options
 | Option | Default | Description |
 |--------|---------|-------------|
 | `--pretrained_path` | None | Path to Pi0.5 checkpoint |
 | `--record_seconds` | 5.0 | Audio recording duration |
 | `--max_response_tokens` | 100 | Max tokens in generated response |
@@ -0,0 +1,336 @@
 #!/usr/bin/env python
 """
 Voice Assistant with Pi0.5: Microphone → STT → Pi0.5 → TTS → Speaker
 This example demonstrates how to use Pi0.5 as a conversational robot assistant:
 1. Hold SPACE to record your voice command
 2. Speech-to-text (Whisper) converts speech to text
 3. Text is fed as a high-level prompt to Pi0.5
 4. Pi0.5 generates a response (robot utterance)
 5. Text-to-speech (Kokoro) speaks the response back
 Requirements:
    pip install torch transformers sounddevice numpy pynput kokoro>=0.9.2
 Usage:
    python examples/voice_assistant/voice_assistant_pi05.py \
        --pretrained_path lerobot/pi0.5-base
 """
 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import argparse
 import re
 import subprocess
 import threading
 import time
 import numpy as np
 import sounddevice as sd
 import torch
 from pynput import keyboard
 from transformers import AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor
 from lerobot.policies.pi05.configuration_pi05 import PI05Config
 from lerobot.policies.pi05.modeling_pi05 import PI05Pytorch
 SAMPLE_RATE = 16000
 def get_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
 class Pi05VoiceAssistant:
    """Voice assistant using Pi0.5 for generating robot utterances."""
    def __init__(
        self,
        pretrained_path: str | None = None,
        max_response_tokens: int = 100,
        max_record_seconds: float = 30.0,
    ):
        self.device = get_device()
        self.dtype = torch.float32 if self.device.type == "mps" else torch.bfloat16
        self.max_response_tokens = max_response_tokens
        self.max_record_seconds = max_record_seconds
        # Push-to-talk state
        self._recording = False
        self._audio_chunks: list[np.ndarray] = []
        self._stream: sd.InputStream | None = None
        print(f"Using device: {self.device}")
        self._load_models(pretrained_path)
    def _load_models(self, pretrained_path: str | None):
        print("Loading STT (Whisper tiny)...")
        self.stt_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
        self.stt_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-tiny.en", torch_dtype=self.dtype
        ).to(self.device)
        print("Loading Pi0.5 model...")
        self._load_pi05(pretrained_path)
        print("Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained("google/paligemma-3b-pt-224")
        self._load_tts()
        print("Ready!\n")
    def _load_pi05(self, pretrained_path: str | None):
        """Load Pi0.5 model for utterance generation."""
        config = PI05Config()
        config.dtype = "float32" if self.device.type == "mps" else "bfloat16"
        self.pi05_model = PI05Pytorch(config)
        if pretrained_path:
            try:
                from safetensors.torch import load_file
                state_dict = load_file(f"{pretrained_path}/model.safetensors")
                self.pi05_model.load_state_dict(state_dict, strict=False)
                print(f"✓ Loaded Pi0.5 weights from {pretrained_path}")
            except Exception as e:
                print(f"Warning: Could not load pretrained weights: {e}")
                print("Using randomly initialized model for demo purposes")
        self.pi05_model = self.pi05_model.to(self.device)
        self.pi05_model.eval()
    def _load_tts(self):
        try:
            print("Loading TTS (Kokoro 82M)...")
            from kokoro import KPipeline
            self.tts_pipeline = KPipeline(lang_code="a")  # American English
            self.tts_voice = "af_heart"
            self.tts_type = "kokoro"
            print("Kokoro loaded!")
        except Exception as e:
            print(f"Kokoro not available ({e})")
            print("Using macOS `say` for TTS")
            self.tts_pipeline = None
            self.tts_type = "system"
    def _audio_callback(self, indata, frames, time_info, status):
        """Callback for audio stream - collects chunks while recording."""
        if self._recording:
            self._audio_chunks.append(indata.copy())
    def _start_recording(self):
        """Start recording audio."""
        if self._recording:
            return
        self._recording = True
        self._audio_chunks = []
        print("🎤 Recording... (release SPACE to stop)")
    def _stop_recording(self) -> np.ndarray | None:
        """Stop recording and return the audio."""
        if not self._recording:
            return None
        self._recording = False
        if not self._audio_chunks:
            return None
        audio = np.concatenate(self._audio_chunks, axis=0).flatten()
        duration = len(audio) / SAMPLE_RATE
        volume = np.abs(audio).max()
        print(f"Recorded {duration:.1f}s, volume: {volume:.4f}")
        if volume < 0.001:
            print("⚠️  Very low audio - check microphone permissions!")
            return None
        return audio
    def wait_for_spacebar(self) -> np.ndarray | None:
        """Wait for spacebar press, record while held, return audio on release."""
        audio_result = None
        recording_done = threading.Event()
        def on_press(key):
            if key == keyboard.Key.space:
                self._start_recording()
        def on_release(key):
            nonlocal audio_result
            if key == keyboard.Key.space and self._recording:
                audio_result = self._stop_recording()
                recording_done.set()
                return False  # Stop listener
        # Start audio stream
        self._stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype="float32",
            callback=self._audio_callback,
            blocksize=int(SAMPLE_RATE * 0.1),  # 100ms blocks
        )
        with self._stream:
            print("\n⏳ Press and hold SPACE to speak...")
            with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
                # Wait for recording to complete or timeout
                recording_done.wait(timeout=self.max_record_seconds)
                if self._recording:
                    audio_result = self._stop_recording()
        return audio_result
    def transcribe(self, audio: np.ndarray) -> str:
        start = time.perf_counter()
        inputs = self.stt_processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
        input_features = inputs.input_features.to(self.device, dtype=self.dtype)
        tokens = self.stt_model.generate(input_features)
        text = self.stt_processor.batch_decode(tokens, skip_special_tokens=True)[0]
        print(f"STT: {time.perf_counter() - start:.2f}s")
        return text.strip()
    def _create_dummy_images(self, batch_size: int = 1) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
        """Create placeholder images for Pi0.5 when no camera is available."""
        image_shape = (batch_size, 3, 224, 224)
        dummy_image = torch.zeros(image_shape, dtype=torch.float32, device=self.device)
        dummy_mask = torch.ones(batch_size, dtype=torch.bool, device=self.device)
        return [dummy_image], [dummy_mask]
    def _tokenize_prompt(self, text: str) -> tuple[torch.Tensor, torch.Tensor]:
        """Tokenize the user prompt for Pi0.5."""
        prompt = f"User request: {text}\nRobot response:"
        tokenized = self.tokenizer(
            [prompt],
            max_length=200,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        tokens = tokenized["input_ids"].to(self.device)
        masks = tokenized["attention_mask"].to(self.device, dtype=torch.bool)
        return tokens, masks
    def generate_response(self, user_text: str) -> str:
        """Generate robot utterance using Pi0.5's language generation."""
        start = time.perf_counter()
        images, img_masks = self._create_dummy_images()
        tokens, masks = self._tokenize_prompt(user_text)
        with torch.no_grad():
            generated_tokens = self.pi05_model._generate_subtask_tokens(
                images=images,
                img_masks=img_masks,
                tokens=tokens,
                masks=masks,
                tokenizer=self.tokenizer,
                max_length=self.max_response_tokens,
                device=self.device,
            )
        # Decode generated tokens
        valid_tokens = generated_tokens[0][generated_tokens[0] != 0]
        response = self.tokenizer.decode(valid_tokens, skip_special_tokens=True)
        # Extract utterance if marked with special tokens
        response = self._extract_utterance(response)
        print(f"Pi0.5: {time.perf_counter() - start:.2f}s")
        return response.strip()
    def _extract_utterance(self, text: str) -> str:
        """Extract utterance from between <utterance> tokens if present."""
        pattern = r"<utterance>(.*?)</utterance>"
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()
        return text
    def speak(self, text: str):
        start = time.perf_counter()
        if self.tts_type == "kokoro":
            generator = self.tts_pipeline(text, voice=self.tts_voice)
            audio_chunks = [audio for _, _, audio in generator]
            if audio_chunks:
                audio = np.concatenate(audio_chunks)
                sd.play(audio, 24000)
                sd.wait()
        else:
            subprocess.run(["say", text], check=True)
        print(f"TTS: {time.perf_counter() - start:.2f}s")
    def run(self):
        print("=" * 50)
        print("Pi0.5 Voice Assistant")
        print("=" * 50)
        print("• Hold SPACE to record your voice command")
        print("• Release SPACE when done speaking")
        print("• Press Ctrl+C to exit")
        print("=" * 50)
        while True:
            try:
                audio = self.wait_for_spacebar()
                if audio is None:
                    print("(no audio captured)\n")
                    continue
                user_text = self.transcribe(audio)
                if not user_text:
                    print("(no speech detected)\n")
                    continue
                print(f"You: {user_text}")
                response = self.generate_response(user_text)
                print(f"Robot: {response}\n")
                self.speak(response)
            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
 def main():
    parser = argparse.ArgumentParser(description="Pi0.5 Voice Assistant")
    parser.add_argument(
        "--pretrained_path",
        type=str,
        default=None,
        help="Path to pretrained Pi0.5 model (optional)",
    )
    parser.add_argument(
        "--max_response_tokens",
        type=int,
        default=100,
        help="Maximum tokens in generated response",
    )
    parser.add_argument(
        "--max_record_seconds",
        type=float,
        default=30.0,
        help="Maximum recording duration in seconds",
    )
    args = parser.parse_args()
    assistant = Pi05VoiceAssistant(
        pretrained_path=args.pretrained_path,
        max_response_tokens=args.max_response_tokens,
        max_record_seconds=args.max_record_seconds,
    )
    assistant.run()
 if __name__ == "__main__":
    main()