mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-16 00:59:46 +00:00
fix(eval): prefetch next task's workers after close to avoid GPU memory overlap
Previously, the next task's AsyncVectorEnv workers were spawned while the current task was still running, so both tasks' GPU contexts coexisted. Moving the prefetch start into the finally block (after env.close()) ensures that the workers for task N+1 only spin up once task N has released its GPU memory. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -767,19 +767,19 @@ def eval_policy_all(
|
||||
prefetch_thread.join()
|
||||
prefetch_thread = None
|
||||
|
||||
# Prefetch next task's AsyncVectorEnv workers while this task runs.
|
||||
if i + 1 < len(tasks):
|
||||
next_env = tasks[i + 1][2]
|
||||
if hasattr(next_env, "_ensure"):
|
||||
prefetch_thread = threading.Thread(target=next_env._ensure, daemon=True)
|
||||
prefetch_thread.start()
|
||||
|
||||
try:
|
||||
tg, tid, metrics = task_runner(task_group, task_id, env)
|
||||
_accumulate_to(tg, metrics)
|
||||
per_task_infos.append({"task_group": tg, "task_id": tid, "metrics": metrics})
|
||||
finally:
|
||||
env.close()
|
||||
# Prefetch next task's workers *after* closing current env to prevent
|
||||
# GPU memory overlap between consecutive tasks.
|
||||
if i + 1 < len(tasks):
|
||||
next_env = tasks[i + 1][2]
|
||||
if hasattr(next_env, "_ensure"):
|
||||
prefetch_thread = threading.Thread(target=next_env._ensure, daemon=True)
|
||||
prefetch_thread.start()
|
||||
else:
|
||||
with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor:
|
||||
fut2meta = {}
|
||||
|
||||
Reference in New Issue
Block a user