Add voice example

Pepijn
2025-12-17 16:31:25 +01:00
parent 8e05dc9a7a
commit b229e7df28
2 changed files with 383 additions and 0 deletions
@@ -0,0 +1,47 @@
# Voice Assistant Examples
Voice-enabled robot assistant examples using speech-to-text (STT) and text-to-speech (TTS).
## Overview
These examples demonstrate how to build a voice interface for robot control:
1. **Hold SPACE** → Push-to-talk recording starts
2. **Release SPACE** → Recording stops
3. **STT (Whisper)** → Converts speech to text (used as the high-level task prompt)
4. **Pi0.5** → Generates robot response/utterance
5. **TTS (Kokoro)** → Speaks the response back
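In code, one conversational turn reduces to the loop below (a condensed sketch built on the `Pi05VoiceAssistant` class from the example script; model loading and error handling are omitted):

```python
assistant = Pi05VoiceAssistant(pretrained_path="path/to/pi05/checkpoint")

while True:
    audio = assistant.wait_for_spacebar()              # record while SPACE is held
    if audio is None:
        continue                                       # nothing captured, try again
    user_text = assistant.transcribe(audio)            # Whisper STT
    response = assistant.generate_response(user_text)  # Pi0.5 subtask/utterance
    assistant.speak(response)                          # Kokoro TTS (or macOS `say`)
```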
## Requirements
```bash
pip install torch transformers sounddevice numpy pynput "kokoro>=0.9.2"
```
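Kokoro is optional on macOS: if it fails to import, the script falls back to the system `say` command for speech output.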
## Usage
### With Pi0.5 Model
```bash
python examples/voice_assistant/voice_assistant_pi05.py \
    --pretrained_path path/to/pi05/checkpoint
```
## How It Works
### Pi0.5 Voice Integration
Pi0.5 can generate robot utterances as part of its subtask prediction. The flow:
1. **High-level prompt**: User voice command is transcribed and formatted as a task prompt
2. **Subtask generation**: Pi0.5 autoregressively generates a response
3. **Utterance extraction**: If the response contains `<utterance>...</utterance>` tags, the content is extracted
4. **TTS output**: The response is spoken back to the user
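Step 3 is a plain regex match, mirroring `_extract_utterance` in the script; if no tags are found, the full response is used as-is:

```python
import re

def extract_utterance(text: str) -> str:
    """Return the content of <utterance>...</utterance> if present, else the raw text."""
    match = re.search(r"<utterance>(.*?)</utterance>", text, re.DOTALL)
    return match.group(1).strip() if match else text
```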
## Configuration Options
| Option | Default | Description |
|--------|---------|-------------|
| `--pretrained_path` | None | Path to Pi0.5 checkpoint |
| `--max_record_seconds` | 30.0 | Maximum recording duration in seconds |
| `--max_response_tokens` | 100 | Max tokens in generated response |
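The options can be combined; for example, to cap responses at 100 tokens and allow up to 30 seconds of recording per turn:

```bash
python examples/voice_assistant/voice_assistant_pi05.py \
    --pretrained_path path/to/pi05/checkpoint \
    --max_response_tokens 100 \
    --max_record_seconds 30.0
```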
@@ -0,0 +1,336 @@
#!/usr/bin/env python
"""
Voice Assistant with Pi0.5: Microphone → STT → Pi0.5 → TTS → Speaker

This example demonstrates how to use Pi0.5 as a conversational robot assistant:
1. Hold SPACE to record your voice command
2. Speech-to-text (Whisper) converts speech to text
3. Text is fed as a high-level prompt to Pi0.5
4. Pi0.5 generates a response (robot utterance)
5. Text-to-speech (Kokoro) speaks the response back

Requirements:
    pip install torch transformers sounddevice numpy pynput "kokoro>=0.9.2"

Usage:
    python examples/voice_assistant/voice_assistant_pi05.py \
        --pretrained_path lerobot/pi0.5-base
"""
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import argparse
import re
import subprocess
import threading
import time

import numpy as np
import sounddevice as sd
import torch
from pynput import keyboard
from transformers import AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor

from lerobot.policies.pi05.configuration_pi05 import PI05Config
from lerobot.policies.pi05.modeling_pi05 import PI05Pytorch

SAMPLE_RATE = 16000


def get_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
class Pi05VoiceAssistant:
    """Voice assistant using Pi0.5 for generating robot utterances."""

    def __init__(
        self,
        pretrained_path: str | None = None,
        max_response_tokens: int = 100,
        max_record_seconds: float = 30.0,
    ):
        self.device = get_device()
        self.dtype = torch.float32 if self.device.type == "mps" else torch.bfloat16
        self.max_response_tokens = max_response_tokens
        self.max_record_seconds = max_record_seconds

        # Push-to-talk state
        self._recording = False
        self._audio_chunks: list[np.ndarray] = []
        self._stream: sd.InputStream | None = None

        print(f"Using device: {self.device}")
        self._load_models(pretrained_path)

    def _load_models(self, pretrained_path: str | None):
        print("Loading STT (Whisper tiny)...")
        self.stt_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
        self.stt_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-tiny.en", torch_dtype=self.dtype
        ).to(self.device)

        print("Loading Pi0.5 model...")
        self._load_pi05(pretrained_path)

        print("Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained("google/paligemma-3b-pt-224")

        self._load_tts()
        print("Ready!\n")

    def _load_pi05(self, pretrained_path: str | None):
        """Load Pi0.5 model for utterance generation."""
        config = PI05Config()
        config.dtype = "float32" if self.device.type == "mps" else "bfloat16"
        self.pi05_model = PI05Pytorch(config)
        if pretrained_path:
            try:
                from safetensors.torch import load_file

                state_dict = load_file(f"{pretrained_path}/model.safetensors")
                self.pi05_model.load_state_dict(state_dict, strict=False)
                print(f"✓ Loaded Pi0.5 weights from {pretrained_path}")
            except Exception as e:
                print(f"Warning: Could not load pretrained weights: {e}")
                print("Using randomly initialized model for demo purposes")
        self.pi05_model = self.pi05_model.to(self.device)
        self.pi05_model.eval()
    def _load_tts(self):
        try:
            print("Loading TTS (Kokoro 82M)...")
            from kokoro import KPipeline

            self.tts_pipeline = KPipeline(lang_code="a")  # American English
            self.tts_voice = "af_heart"
            self.tts_type = "kokoro"
            print("Kokoro loaded!")
        except Exception as e:
            print(f"Kokoro not available ({e})")
            print("Using macOS `say` for TTS")
            self.tts_pipeline = None
            self.tts_type = "system"

    def _audio_callback(self, indata, frames, time_info, status):
        """Callback for audio stream - collects chunks while recording."""
        if self._recording:
            self._audio_chunks.append(indata.copy())

    def _start_recording(self):
        """Start recording audio."""
        if self._recording:
            return
        self._recording = True
        self._audio_chunks = []
        print("🎤 Recording... (release SPACE to stop)")

    def _stop_recording(self) -> np.ndarray | None:
        """Stop recording and return the audio."""
        if not self._recording:
            return None
        self._recording = False
        if not self._audio_chunks:
            return None
        audio = np.concatenate(self._audio_chunks, axis=0).flatten()
        duration = len(audio) / SAMPLE_RATE
        volume = np.abs(audio).max()
        print(f"Recorded {duration:.1f}s, volume: {volume:.4f}")
        if volume < 0.001:
            print("⚠️ Very low audio - check microphone permissions!")
            return None
        return audio
    def wait_for_spacebar(self) -> np.ndarray | None:
        """Wait for spacebar press, record while held, return audio on release."""
        audio_result = None
        recording_done = threading.Event()

        def on_press(key):
            if key == keyboard.Key.space:
                self._start_recording()

        def on_release(key):
            nonlocal audio_result
            if key == keyboard.Key.space and self._recording:
                audio_result = self._stop_recording()
                recording_done.set()
                return False  # Stop listener

        # Start audio stream
        self._stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype="float32",
            callback=self._audio_callback,
            blocksize=int(SAMPLE_RATE * 0.1),  # 100ms blocks
        )
        with self._stream:
            print("\n⏳ Press and hold SPACE to speak...")
            with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
                # Wait for recording to complete or timeout
                recording_done.wait(timeout=self.max_record_seconds)
            if self._recording:
                audio_result = self._stop_recording()
        return audio_result
    def transcribe(self, audio: np.ndarray) -> str:
        start = time.perf_counter()
        inputs = self.stt_processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
        input_features = inputs.input_features.to(self.device, dtype=self.dtype)
        tokens = self.stt_model.generate(input_features)
        text = self.stt_processor.batch_decode(tokens, skip_special_tokens=True)[0]
        print(f"STT: {time.perf_counter() - start:.2f}s")
        return text.strip()

    def _create_dummy_images(self, batch_size: int = 1) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
        """Create placeholder images for Pi0.5 when no camera is available."""
        image_shape = (batch_size, 3, 224, 224)
        dummy_image = torch.zeros(image_shape, dtype=torch.float32, device=self.device)
        dummy_mask = torch.ones(batch_size, dtype=torch.bool, device=self.device)
        return [dummy_image], [dummy_mask]

    def _tokenize_prompt(self, text: str) -> tuple[torch.Tensor, torch.Tensor]:
        """Tokenize the user prompt for Pi0.5."""
        prompt = f"User request: {text}\nRobot response:"
        tokenized = self.tokenizer(
            [prompt],
            max_length=200,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        tokens = tokenized["input_ids"].to(self.device)
        masks = tokenized["attention_mask"].to(self.device, dtype=torch.bool)
        return tokens, masks

    def generate_response(self, user_text: str) -> str:
        """Generate robot utterance using Pi0.5's language generation."""
        start = time.perf_counter()
        images, img_masks = self._create_dummy_images()
        tokens, masks = self._tokenize_prompt(user_text)
        with torch.no_grad():
            generated_tokens = self.pi05_model._generate_subtask_tokens(
                images=images,
                img_masks=img_masks,
                tokens=tokens,
                masks=masks,
                tokenizer=self.tokenizer,
                max_length=self.max_response_tokens,
                device=self.device,
            )
        # Decode generated tokens
        valid_tokens = generated_tokens[0][generated_tokens[0] != 0]
        response = self.tokenizer.decode(valid_tokens, skip_special_tokens=True)
        # Extract utterance if marked with special tokens
        response = self._extract_utterance(response)
        print(f"Pi0.5: {time.perf_counter() - start:.2f}s")
        return response.strip()

    def _extract_utterance(self, text: str) -> str:
        """Extract utterance from between <utterance> tokens if present."""
        pattern = r"<utterance>(.*?)</utterance>"
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()
        return text
    def speak(self, text: str):
        start = time.perf_counter()
        if self.tts_type == "kokoro":
            generator = self.tts_pipeline(text, voice=self.tts_voice)
            audio_chunks = [audio for _, _, audio in generator]
            if audio_chunks:
                audio = np.concatenate(audio_chunks)
                sd.play(audio, 24000)
                sd.wait()
        else:
            subprocess.run(["say", text], check=True)
        print(f"TTS: {time.perf_counter() - start:.2f}s")

    def run(self):
        print("=" * 50)
        print("Pi0.5 Voice Assistant")
        print("=" * 50)
        print("• Hold SPACE to record your voice command")
        print("• Release SPACE when done speaking")
        print("• Press Ctrl+C to exit")
        print("=" * 50)
        while True:
            try:
                audio = self.wait_for_spacebar()
                if audio is None:
                    print("(no audio captured)\n")
                    continue
                user_text = self.transcribe(audio)
                if not user_text:
                    print("(no speech detected)\n")
                    continue
                print(f"You: {user_text}")
                response = self.generate_response(user_text)
                print(f"Robot: {response}\n")
                self.speak(response)
            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
def main():
    parser = argparse.ArgumentParser(description="Pi0.5 Voice Assistant")
    parser.add_argument(
        "--pretrained_path",
        type=str,
        default=None,
        help="Path to pretrained Pi0.5 model (optional)",
    )
    parser.add_argument(
        "--max_response_tokens",
        type=int,
        default=100,
        help="Maximum tokens in generated response",
    )
    parser.add_argument(
        "--max_record_seconds",
        type=float,
        default=30.0,
        help="Maximum recording duration in seconds",
    )
    args = parser.parse_args()

    assistant = Pi05VoiceAssistant(
        pretrained_path=args.pretrained_path,
        max_response_tokens=args.max_response_tokens,
        max_record_seconds=args.max_record_seconds,
    )
    assistant.run()


if __name__ == "__main__":
    main()