mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-21 11:39:50 +00:00
feat(annotate): parallelize episodes within each module phase
Saturates parallel_servers + client_concurrency. Previously the executor processed one episode at a time, so each Module 1 episode's 3-5 dependent VLM calls hit a single server with the others idle. Now defaults to 16 episodes in flight; configurable via ExecutorConfig.episode_parallelism. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -162,6 +162,13 @@ class ExecutorConfig:
|
|||||||
slurm_gpus: int = 1
|
slurm_gpus: int = 1
|
||||||
slurm_time: str = "06:00:00"
|
slurm_time: str = "06:00:00"
|
||||||
workers: int = 1
|
workers: int = 1
|
||||||
|
episode_parallelism: int = 16
|
||||||
|
"""Number of episodes processed concurrently within each module phase.
|
||||||
|
Each in-flight episode sends 3–5 dependent VLM calls; bumping this is
|
||||||
|
how you actually saturate ``parallel_servers`` and ``client_concurrency``
|
||||||
|
— without it, the executor loops one episode at a time and the
|
||||||
|
inference servers sit ~90% idle. Set to ``1`` for strict serial
|
||||||
|
execution."""
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -136,25 +136,48 @@ class Executor:
|
|||||||
module: Any,
|
module: Any,
|
||||||
) -> PhaseResult:
|
) -> PhaseResult:
|
||||||
import time as _time # noqa: PLC0415
|
import time as _time # noqa: PLC0415
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed # noqa: PLC0415
|
||||||
|
|
||||||
if not module.enabled:
|
if not module.enabled:
|
||||||
print(f"[annotate] phase={name} skipped (module disabled)", flush=True)
|
print(f"[annotate] phase={name} skipped (module disabled)", flush=True)
|
||||||
return PhaseResult(name=name, episodes_processed=0, episodes_skipped=len(records))
|
return PhaseResult(name=name, episodes_processed=0, episodes_skipped=len(records))
|
||||||
n = len(records)
|
n = len(records)
|
||||||
print(f"[annotate] phase={name} starting on {n} episode(s)", flush=True)
|
parallelism = max(1, min(self.config.executor.episode_parallelism, n))
|
||||||
|
print(
|
||||||
|
f"[annotate] phase={name} starting on {n} episode(s) "
|
||||||
|
f"(parallelism={parallelism})",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
t0 = _time.time()
|
t0 = _time.time()
|
||||||
processed = 0
|
|
||||||
for i, record in enumerate(records, 1):
|
def _do(idx_record: tuple[int, EpisodeRecord]) -> tuple[int, int, float]:
|
||||||
|
i, record = idx_record
|
||||||
ep_start = _time.time()
|
ep_start = _time.time()
|
||||||
staging = EpisodeStaging(staging_dir, record.episode_index)
|
staging = EpisodeStaging(staging_dir, record.episode_index)
|
||||||
module.run_episode(record, staging)
|
module.run_episode(record, staging)
|
||||||
processed += 1
|
return i, record.episode_index, _time.time() - ep_start
|
||||||
elapsed = _time.time() - ep_start
|
|
||||||
print(
|
processed = 0
|
||||||
f"[annotate] {name} episode {i}/{n} "
|
if parallelism == 1:
|
||||||
f"(idx={record.episode_index}) done in {elapsed:.1f}s",
|
for i, record in enumerate(records, 1):
|
||||||
flush=True,
|
_, ep_idx, elapsed = _do((i, record))
|
||||||
)
|
processed += 1
|
||||||
|
print(
|
||||||
|
f"[annotate] {name} episode {i}/{n} "
|
||||||
|
f"(idx={ep_idx}) done in {elapsed:.1f}s",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
with ThreadPoolExecutor(max_workers=parallelism) as pool:
|
||||||
|
futures = [pool.submit(_do, (i, r)) for i, r in enumerate(records, 1)]
|
||||||
|
for fut in as_completed(futures):
|
||||||
|
i, ep_idx, elapsed = fut.result()
|
||||||
|
processed += 1
|
||||||
|
print(
|
||||||
|
f"[annotate] {name} episode {processed}/{n} "
|
||||||
|
f"(idx={ep_idx}, submit_order={i}) done in {elapsed:.1f}s",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
total = _time.time() - t0
|
total = _time.time() - t0
|
||||||
print(f"[annotate] phase={name} complete: {processed}/{n} in {total:.1f}s", flush=True)
|
print(f"[annotate] phase={name} complete: {processed}/{n} in {total:.1f}s", flush=True)
|
||||||
return PhaseResult(name=name, episodes_processed=processed, episodes_skipped=0)
|
return PhaseResult(name=name, episodes_processed=processed, episodes_skipped=0)
|
||||||
|
|||||||
Reference in New Issue
Block a user