From af2f044f5aa6215944f04585d1d9b32324b75107 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Sat, 9 Aug 2025 01:14:43 +0200
Subject: [PATCH] feat(rolling vstack): use an in-place, copy-efficient
 implementation of the rolling vstack for the audio buffer

---
 src/lerobot/scripts/lerobot_record.py | 20 +++------------
 src/lerobot/utils/audio_utils.py      | 37 +++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 17 deletions(-)
 create mode 100644 src/lerobot/utils/audio_utils.py

diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py
index 2027e8fcf..75cb2b64e 100644
--- a/src/lerobot/scripts/lerobot_record.py
+++ b/src/lerobot/scripts/lerobot_record.py
@@ -136,6 +136,7 @@ from lerobot.teleoperators import (  # noqa: F401
     so_leader,
 )
 from lerobot.teleoperators.keyboard.teleop_keyboard import KeyboardTeleop
+from lerobot.utils.audio_utils import rolling_vstack
 from lerobot.utils.constants import ACTION, OBS_STR
 from lerobot.utils.control_utils import (
     init_keyboard_listener,
@@ -358,17 +359,9 @@ def record_loop(
         # (1) ensure that the audio buffers are filled with enough data
         # (2) add additional initial samples to the dataset in case of variable audio chunk duration during training
         busy_wait(DEFAULT_INITIAL_AUDIO_BUFFER_DURATION)
-
         for microphone_name, microphone in robot.microphones.items():
             audio_chunk = microphone.read()
-
-            buffer_size = audio_buffer[microphone_name].shape[0]
-            # Remove as many old audio samples as needed
-            audio_buffer[microphone_name] = audio_buffer[microphone_name][len(audio_chunk) :]
-            # Add new audio samples, only the newest if the buffer is already full
-            audio_buffer[microphone_name] = np.vstack(
-                (audio_buffer[microphone_name], audio_chunk[-buffer_size:])
-            )
+            audio_buffer[microphone_name] = rolling_vstack(audio_buffer[microphone_name], audio_chunk)
 
     timestamp = 0
     start_episode_t = time.perf_counter()
@@ -393,15 +386,8 @@ def record_loop(
             # Transform instantaneous audio samples into a buffer of fixed size
             buffered_observation_frame = copy(observation_frame)
             for name in audio_buffer:
-                buffer_size = audio_buffer[name].shape[0]
-                # Remove as many old audio samples as needed
-                audio_buffer[name] = audio_buffer[name][len(buffered_observation_frame[name]) :]
-                # Add new audio samples
-                audio_buffer[name] = np.vstack(
-                    (audio_buffer[name], buffered_observation_frame[name][-buffer_size:])
-                )
                 # Add the audio buffer to the observation
-                buffered_observation_frame[name] = audio_buffer[name]
+                buffered_observation_frame[name] = rolling_vstack(audio_buffer[name], observation_frame[name])
 
             action_values = predict_action(
                 observation=buffered_observation_frame,
diff --git a/src/lerobot/utils/audio_utils.py b/src/lerobot/utils/audio_utils.py
new file mode 100644
index 000000000..f4b4a216a
--- /dev/null
+++ b/src/lerobot/utils/audio_utils.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+def rolling_vstack(buffer: np.ndarray, new_data: np.ndarray) -> np.ndarray:
+    """
+    Rolling, in-place counterpart of numpy.vstack: append new data at the end of a fixed shape buffer.
+
+    Args:
+        buffer: The *fixed* shape buffer to update. It is modified in place.
+        new_data: The new data to add. Only its newest rows are kept if it is longer than the buffer.
+
+    Returns:
+        The updated buffer (the very same array object as `buffer`, not a copy).
+    """
+    # Number of rows actually written: the whole chunk, or only its newest part if it exceeds the buffer
+    n_new = min(len(new_data), buffer.shape[0])
+    # Nothing to do for an empty chunk: `-0` slices would address the whole buffer and fail to broadcast
+    if n_new > 0:
+        buffer[:-n_new] = buffer[n_new:]  # Shift the surviving old samples towards the start
+        buffer[-n_new:] = new_data[-n_new:]  # Write the newest samples at the end
+    return buffer