feat(rolling vstack): use an in-place, copy-efficient implementation of the rolling vstack for the audio buffer

2026-06-18 16:57:12 +00:00 · 2025-08-09 01:14:43 +02:00
parent 0caba222ef
commit af2f044f5a
2 changed files with 40 additions and 17 deletions
@@ -136,6 +136,7 @@ from lerobot.teleoperators import (  # noqa: F401
    so_leader,
 )
 from lerobot.teleoperators.keyboard.teleop_keyboard import KeyboardTeleop
+from lerobot.utils.audio_utils import rolling_vstack
 from lerobot.utils.constants import ACTION, OBS_STR
 from lerobot.utils.control_utils import (
    init_keyboard_listener,
@@ -358,17 +359,9 @@ def record_loop(
        # (1) ensure that the audio buffers are filled with enough data
        # (2) add additional initial samples to the dataset in case of variable audio chunk duration during training
        busy_wait(DEFAULT_INITIAL_AUDIO_BUFFER_DURATION)
-
        for microphone_name, microphone in robot.microphones.items():
            audio_chunk = microphone.read()
-
-            buffer_size = audio_buffer[microphone_name].shape[0]
-            # Remove as many old audio samples as needed
-            audio_buffer[microphone_name] = audio_buffer[microphone_name][len(audio_chunk) :]
-            # Add new audio samples, only the newest if the buffer is already full
-            audio_buffer[microphone_name] = np.vstack(
-                (audio_buffer[microphone_name], audio_chunk[-buffer_size:])
-            )
+            audio_buffer[microphone_name] = rolling_vstack(audio_buffer[microphone_name], audio_chunk)

    timestamp = 0
    start_episode_t = time.perf_counter()
@@ -393,15 +386,8 @@ def record_loop(
            # Transform instantaneous audio samples into a buffer of fixed size
            buffered_observation_frame = copy(observation_frame)
            for name in audio_buffer:
-                buffer_size = audio_buffer[name].shape[0]
-                # Remove as many old audio samples as needed
-                audio_buffer[name] = audio_buffer[name][len(buffered_observation_frame[name]) :]
-                # Add new audio samples
-                audio_buffer[name] = np.vstack(
-                    (audio_buffer[name], buffered_observation_frame[name][-buffer_size:])
-                )
                # Add the audio buffer to the observation
-                buffered_observation_frame[name] = audio_buffer[name]
+                buffered_observation_frame[name] = rolling_vstack(audio_buffer[name], observation_frame[name])

            action_values = predict_action(
                observation=buffered_observation_frame,
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+def rolling_vstack(buffer: np.ndarray, new_data: np.ndarray) -> np.ndarray:
+    """
+    Shift `buffer` in place along its first axis and write `new_data` at its end, keeping its shape fixed.
+
+    Args:
+        buffer: The *fixed* shape buffer to update; it is modified in place.
+        new_data: The new data to add; only its newest rows are kept if it is longer than the buffer.
+
+    Returns:
+        The same `buffer` object, updated (left untouched if `new_data` or `buffer` is empty).
+    """
+    buffer_size = buffer.shape[0]
+    # Clamp to the buffer length and skip empty updates: with 0 rows, `-0` slice bounds select the wrong span
+    num_new = min(len(new_data), buffer_size)
+    if num_new > 0:
+        buffer[: buffer_size - num_new] = buffer[num_new:]  # drop the oldest samples
+        buffer[buffer_size - num_new :] = new_data[-num_new:]  # append only the newest samples
+    return buffer