From af2f044f5aa6215944f04585d1d9b32324b75107 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Sat, 9 Aug 2025 01:14:43 +0200 Subject: [PATCH] feat(rolling vstack): opting for an inplace copy efficient implementation of the rolling vstack for the audio buffer --- src/lerobot/scripts/lerobot_record.py | 20 +++------------ src/lerobot/utils/audio_utils.py | 37 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 17 deletions(-) create mode 100644 src/lerobot/utils/audio_utils.py diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py index 2027e8fcf..75cb2b64e 100644 --- a/src/lerobot/scripts/lerobot_record.py +++ b/src/lerobot/scripts/lerobot_record.py @@ -136,6 +136,7 @@ from lerobot.teleoperators import ( # noqa: F401 so_leader, ) from lerobot.teleoperators.keyboard.teleop_keyboard import KeyboardTeleop +from lerobot.utils.audio_utils import rolling_vstack from lerobot.utils.constants import ACTION, OBS_STR from lerobot.utils.control_utils import ( init_keyboard_listener, @@ -358,17 +359,9 @@ def record_loop( # (1) ensure that the audio buffers are filled with enough data # (2) add additional initial samples to the dataset in case of variable audio chunk duration during training busy_wait(DEFAULT_INITIAL_AUDIO_BUFFER_DURATION) - for microphone_name, microphone in robot.microphones.items(): audio_chunk = microphone.read() - - buffer_size = audio_buffer[microphone_name].shape[0] - # Remove as many old audio samples as needed - audio_buffer[microphone_name] = audio_buffer[microphone_name][len(audio_chunk) :] - # Add new audio samples, only the newest if the buffer is already full - audio_buffer[microphone_name] = np.vstack( - (audio_buffer[microphone_name], audio_chunk[-buffer_size:]) - ) + audio_buffer[microphone_name] = rolling_vstack(audio_buffer[microphone_name], audio_chunk) timestamp = 0 start_episode_t = time.perf_counter() @@ -393,15 +386,8 @@ def record_loop( # Transform instantaneous audio samples into a 
def rolling_vstack(buffer: np.ndarray, new_data: np.ndarray) -> np.ndarray:
    """
    Append new rows at the end of a fixed-shape buffer, in place, dropping the oldest rows.

    The buffer keeps its shape: the rows already stored are shifted towards index 0 by
    ``len(new_data)`` positions and the freed rows at the end are overwritten with ``new_data``.
    No new array is allocated for the result; the very same ``buffer`` object is modified
    and returned.

    Args:
        buffer: The *fixed* shape buffer to update. It is modified in place.
        new_data: The new data to add to the buffer, stacked along the first axis.
            If it holds at least as many rows as the buffer, only its newest
            ``buffer.shape[0]`` rows are kept. If it is empty, the buffer is left untouched.

    Returns:
        The updated buffer (the same object as ``buffer``).
    """
    buffer_size = buffer.shape[0]
    num_new = len(new_data)

    # Nothing to add, or nowhere to store it. This guard matters: with num_new == 0 the
    # slices below would become buffer[:-0] (empty) and buffer[-0:] (the whole buffer),
    # which raises a broadcasting error instead of being a no-op.
    if num_new == 0 or buffer_size == 0:
        return buffer

    # The new chunk fills the whole buffer: keep only its newest rows.
    if num_new >= buffer_size:
        buffer[:] = new_data[-buffer_size:]
        return buffer

    # Remove as many old samples as needed by shifting the remaining ones to the front.
    # Source and destination overlap; NumPy handles overlapping assignments safely.
    buffer[:-num_new] = buffer[num_new:]
    # Add the new samples at the end.
    buffer[-num_new:] = new_data
    return buffer