mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-17 16:27:04 +00:00
68fa5d80b0
- examples/scaling/train_streaming_multinode.py: Accelerate-based distributed/ resumable streaming training (no DistributedSampler; rank/world_size auto-resolved), checkpoints the dataset stream state, and supports a --dummy pure-dataloading path with throughput logging. SLURM launcher in slurm/train_streaming_robocasa.sh. - benchmarks/streaming/benchmark_streaming.py: dummy-consumer dataloading benchmark (single / sarm frame modes) emitting frames/s/node, p50/p95/p99 sample latency, first-batch latency, and VideoDecoderCache reuse stats as JSON + CSV. SLURM launcher + README documenting the source/node/mode matrix and manual bucket prewarming. - VideoDecoderCache: add hit/miss/eviction counters and a stats() method so the benchmark can surface decoder thrash (no new cache, no eviction-policy change). - tests/datasets/test_streaming_distributed.py: accelerate-launch smoke test asserting per-rank disjointness; skips (does not false-pass) when <2 processes spawn. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
101 lines
3.6 KiB
Python
101 lines
3.6 KiB
Python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""End-to-end distributed streaming smoke test under a real `accelerate launch`.
|
|
|
|
Mirrors tests/training/test_multi_gpu.py but runs on CPU and only checks the dataloading contract: with
|
|
two processes, `split_dataset_by_node` (auto-resolved from the Accelerate state) must give each rank a
|
|
disjoint set of frames that together cover the dataset. Skips if the environment can't actually spawn
|
|
>= 2 processes (e.g. local macOS multi-CPU), so it never silently passes as a single process.
|
|
"""
|
|
|
|
import json
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
|
|
import pytest
|
|
|
|
pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
|
|
pytest.importorskip("accelerate", reason="accelerate is required (install lerobot[training])")
|
|
|
|
from tests.fixtures.constants import DUMMY_REPO_ID
|
|
|
|
# Source of the script each rank runs under `accelerate launch`.
# argv: dataset root, repo id, output directory.
# The script iterates the streaming dataset once and writes
# `rank_<process_index>.json` containing the rank, the world size and every
# frame index that rank observed, so the parent test can compare ranks.
WORKER = """
import json, sys
from accelerate import PartialState
from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset

root, repo_id, out_dir = sys.argv[1], sys.argv[2], sys.argv[3]
state = PartialState()
ds = StreamingLeRobotDataset(
    repo_id=repo_id, root=root, shuffle=False, buffer_size=8, max_num_shards=8
)
indices = [int(frame["index"]) for frame in ds]
payload = {"rank": state.process_index, "world": state.num_processes, "indices": indices}
with open(f"{out_dir}/rank_{state.process_index}.json", "w") as f:
    json.dump(payload, f)
"""
|
|
|
|
|
|
@pytest.mark.skipif(shutil.which("accelerate") is None, reason="accelerate CLI not available")
def test_accelerate_launch_ranks_are_disjoint(tmp_path, lerobot_dataset_factory):
    """Launch two CPU ranks with `accelerate launch` and compare the frames each one streamed.

    A small non-video dataset is built under ``tmp_path``, the ``WORKER`` script is written
    to disk and launched with ``--num_processes=2``. Each rank dumps the frame indices it
    saw to ``rank_<i>.json``. The test then asserts that the first two ranks share no
    index and that the union over all ranks equals ``range(total_frames)``.

    The test is skipped (rather than passed) when fewer than two rank files exist or any
    rank reports a world size below 2.
    """
    total_frames = 160
    repo_id = f"{DUMMY_REPO_ID}-acc"
    dataset_root = tmp_path / "ds"
    factory_kwargs = {
        "root": dataset_root,
        "repo_id": repo_id,
        "total_episodes": 8,
        "total_frames": total_frames,
        "use_videos": False,
        "data_files_size_in_mb": 0.001,
        "chunks_size": 1,
    }
    lerobot_dataset_factory(**factory_kwargs)

    # Materialise the worker script and the directory the ranks report into.
    script_path = tmp_path / "worker.py"
    script_path.write_text(WORKER)
    results_dir = tmp_path / "out"
    results_dir.mkdir()

    launcher_args = [
        "accelerate",
        "launch",
        "--num_processes=2",
        "--num_machines=1",
        "--mixed_precision=no",
        "--dynamo_backend=no",
        "--cpu",
    ]
    worker_args = [str(script_path), str(dataset_root), repo_id, str(results_dir)]
    completed = subprocess.run(
        launcher_args + worker_args,
        capture_output=True,
        text=True,
        timeout=600,
    )
    assert completed.returncode == 0, (
        f"accelerate launch failed:\nSTDOUT:\n{completed.stdout}\nSTDERR:\n{completed.stderr}"
    )

    # Load the per-rank reports in a stable (sorted by filename) order.
    reports = []
    for report_path in sorted(results_dir.glob("rank_*.json")):
        reports.append(json.loads(report_path.read_text()))

    # Without two genuinely distributed ranks the assertions below would be meaningless.
    truly_distributed = len(reports) >= 2 and all(report["world"] >= 2 for report in reports)
    if not truly_distributed:
        pytest.skip("environment did not spawn >= 2 distributed processes (e.g. local macOS multi-CPU)")

    per_rank = [set(report["indices"]) for report in reports]
    first_rank, second_rank = per_rank[0], per_rank[1]
    assert not (first_rank & second_rank), "ranks streamed overlapping frames under accelerate launch"

    seen = set()
    for rank_indices in per_rank:
        seen |= rank_indices
    assert seen == set(range(total_frames)), "ranks did not jointly cover all frames"
|
|
|
|
|
|
if __name__ == "__main__":
    # Running the file directly runs pytest on this module in verbose mode and
    # uses pytest's return value as the process exit status.
    exit_status = pytest.main([__file__, "-v"])
    sys.exit(exit_status)
|